JavaTfIdfExample.java
package ml;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.IDF;
import org.apache.spark.ml.feature.IDFModel;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.util.Arrays;
import java.util.List;
public class JavaTfIdfExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
.appName("JavaTfIdfExample")
.master("local")
.getOrCreate();
List<Row> data = Arrays.asList(
RowFactory.create(0.0, "Hi I heard about Spark"),
RowFactory.create(0.0, "I wish Java could use case classes"),
RowFactory.create(1.0, "Logistic regression models are neat")
);
// System.out.println(data);
// [[0.0,Hi I heard about Spark],
// [0.0,I wish Java could use case classes],
// [1.0,Logistic regression models are neat]]
StructType schema = new StructType(new StructField[]{
new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
});
Dataset<Row> sentenceData = spark.createDataFrame(data,schema);
// A tokenizer that converts the input string to lowercase
// and then splits it by white spaces.
// Tokenizer is a Transformer: its transform() method maps one DataFrame to another by appending one or more derived columns.
Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
Dataset<Row> wordsData = tokenizer.transform(sentenceData);
wordsData.show(false);
//
// +-----+-----------------------------------+------------------------------------------+
// |label|sentence |words |
// +-----+-----------------------------------+------------------------------------------+
// |0.0 |Hi I heard about Spark |[hi, i, heard, about, spark] |
// |0.0 |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|
// |1.0 |Logistic regression models are neat|[logistic, regression, models, are, neat] |
// +-----+-----------------------------------+------------------------------------------+
int numFeatures = 20; // must match the vector size (20) shown in the outputs below
// HashingTF is a Transformer: it hashes each word into one of numFeatures buckets
// and appends the resulting term-frequency vector as a new column ("rawFeatures").
HashingTF hashingTF = new HashingTF()
.setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(numFeatures);
Dataset<Row> featurizedData = hashingTF.transform(wordsData);
// featurizedData.select("rawFeatures").show(false);
// +-----------------------------------------+
// |rawFeatures |
// +-----------------------------------------+
// |(20,[0,5,9,17],[1.0,1.0,1.0,2.0]) |
// |(20,[2,7,9,13,15],[1.0,1.0,3.0,1.0,1.0]) |
// |(20,[4,6,13,15,18],[1.0,1.0,1.0,1.0,1.0])|
// +-----------------------------------------+
// IDF->Estimator
IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
IDFModel idfModel = idf.fit(featurizedData);
Dataset<Row> rescaledData = idfModel.transform(featurizedData);
rescaledData.select("label","features").show(false);
// +-----+----------------------------------------------------------------------------------------------------------------------+
// |label|features |
// +-----+----------------------------------------------------------------------------------------------------------------------+
// |0.0 |(20,[0,5,9,17],[0.6931471805599453,0.6931471805599453,0.28768207245178085,1.3862943611198906]) |
// |0.0 |(20,[2,7,9,13,15],[0.6931471805599453,0.6931471805599453,0.8630462173553426,0.28768207245178085,0.28768207245178085]) |
// |1.0 |(20,[4,6,13,15,18],[0.6931471805599453,0.6931471805599453,0.28768207245178085,0.28768207245178085,0.6931471805599453])|
// +-----+----------------------------------------------------------------------------------------------------------------------+
}
}
Result analysis:
Analysis of the first row, i.e. document 1 (logarithms are natural, base e).
Hash buckets: hi -> 0; spark -> 5; i -> 9; heard -> 17; about -> 17 (it collides with heard, so bucket 17 gets term frequency 2).
TFIDF(hi, d, D)    = 1 * log((3+1)/(1+1)) = log 2    = 0.6931471805599453
TFIDF(spark, d, D) = 1 * log((3+1)/(1+1)) = log 2    = 0.6931471805599453
TFIDF(i, d, D)     = 1 * log((3+1)/(2+1)) = log(4/3) = 0.28768207245178085   (i also appears in document 2, so its document frequency is 2)
TFIDF(heard + about, d, D) = 2 * log((3+1)/(1+1)) = 2 * log 2 = 1.3862943611198906
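The numbers above can be reproduced with plain java.lang.Math. A minimal standalone sketch (the class name TfIdfCheck is just for illustration); it only re-applies Spark's smoothed IDF formula idf(t) = log((|D| + 1) / (df(t) + 1)):
public class TfIdfCheck {
public static void main(String[] args) {
int numDocs = 3; // |D|: number of documents in the corpus
double idfDf1 = Math.log((numDocs + 1.0) / (1 + 1)); // terms occurring in 1 document
double idfDf2 = Math.log((numDocs + 1.0) / (2 + 1)); // terms occurring in 2 documents
System.out.println(1 * idfDf1); // hi, spark -> 0.6931471805599453
System.out.println(1 * idfDf2); // i -> 0.28768207245178085
System.out.println(2 * idfDf1); // heard + about (bucket 17) -> 1.3862943611198906
}
}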
// findSynonyms
// The test data is constructed directly in the program.
package ml.java;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.ml.feature.Word2Vec;
import org.apache.spark.ml.feature.Word2VecModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.*;
public class JavaWord2VecExample {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
.master("local")
.appName("JavaWord2VecExample")
.getOrCreate();
// Input data: Each row is a bag of words from a sentence or document.
List<Row> data = Arrays.asList(
RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
);
StructType schema = new StructType(new StructField[]{
new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
});
Dataset<Row> documentDF = spark.createDataFrame(data, schema);
// Learn a mapping from words to Vectors.
Word2Vec word2Vec = new Word2Vec()
.setInputCol("text")
.setOutputCol("result")
// .setVectorSize(3)
.setMinCount(0);
Word2VecModel model = word2Vec.fit(documentDF);
Dataset<Row> synonyms = model.findSynonyms("Hi",4);
synonyms.show();
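// Optionally, inspect what the model learned: getVectors() lists the per-word
// embeddings, and transform() appends each document's averaged word vector
// in the "result" output column.
model.getVectors().show(false);
Dataset<Row> docVectors = model.transform(documentDF);
docVectors.select("result").show(false);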
spark.stop();
}
}
Java version:
package ml.ETS.T;
import org.apache.spark.ml.feature.StringIndexer;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.Dataset;
import java.util.Arrays;
import java.util.List;
import static org.apache.spark.sql.types.DataTypes.*;
/**
 * Add the dependency spark-mllib_${scala.version}.
 *
 * StringIndexer: a string indexer which, as the name suggests, assigns an index to each string.
 * 1. Converts a column of string labels into a column of ML label indices.
 * 2. Index values lie in the range [0, numLabels).
 * 3. Supports four ordering options: frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc
 *    (the default is frequencyDesc; see the sketch after this class for switching the ordering).
 */
public class StringIndexerDemo {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
.master("local")
.appName("StringIndexer")
.getOrCreate();
// Step 1: create the DataFrame
List<Row> data = Arrays.asList(
RowFactory.create(0, "a"),
RowFactory.create(1, "b"),
RowFactory.create(2, "c"),
RowFactory.create(3, "a"),
RowFactory.create(4, "a"),
RowFactory.create(5, "c")
);
StructType schema = new StructType(new StructField[]{
createStructField("id", IntegerType, false),
createStructField("category", StringType, false)
});
Dataset<Row> df = spark.createDataFrame(data,schema);
// Step 2: feature transformation (convert the string label column into a label index column)
// Estimator
// StringIndexer is an Estimator (the abstraction of a learning or training algorithm).
// An Estimator's fit method takes a DataFrame and produces a StringIndexerModel (which is a Transformer).
// Transformer: the abstraction of feature transformers and learned models.
// Feature transformer: reads one column, maps it to a new column, and outputs a new DataFrame with the new column appended.
// Learned model: reads the feature-vector column, predicts a label for each feature vector, and outputs a new DataFrame with the prediction column appended.
StringIndexer indexer = new StringIndexer()
.setInputCol("category")
.setOutputCol("categoryIndex");
Dataset<Row> indexed = indexer.fit(df).transform(df);
indexed.show();
spark.stop();
/**
* +---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
| 0| a| 0.0|
| 1| b| 2.0|
| 2| c| 1.0|
| 3| a| 0.0|
| 4| a| 0.0|
| 5| c| 1.0|
+---+--------+-------------+
Result analysis:
'a' occurs most often, so it is assigned index 0; it is followed by 'c' and then 'b', which get indices 1 and 2.
*/
}
}
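The ordering options listed in the class comment can be selected with setStringOrderType (available on StringIndexer since Spark 2.3; the default is frequencyDesc). A minimal sketch, reusing the df built in the demo above:
StringIndexer alphaIndexer = new StringIndexer()
.setInputCol("category")
.setOutputCol("categoryIndex")
.setStringOrderType("alphabetAsc");
alphaIndexer.fit(df).transform(df).show();
// With alphabetAsc the mapping becomes a -> 0.0, b -> 1.0, c -> 2.0,
// independent of how often each label occurs.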
Main purpose: improve the performance of tree-based ML methods such as decision trees and random forests.
VectorIndexer indexes the categorical (discrete-valued) features inside a dataset's feature vectors. It automatically decides which features are categorical and re-indexes their values. Concretely:
1. Set maxCategories on the vector-typed input column;
2. If a feature takes no more than maxCategories distinct values, its values are re-indexed into [0, maxCategories);
if a feature takes more than maxCategories distinct values, it is treated as continuous and left unchanged.
Java version:
import java.util.Map;
import org.apache.spark.ml.feature.VectorIndexer;
import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.Dataset;
public class VectorIndexerDemo {
public static void main(String[] args) {
SparkSession spark = SparkSession
.builder()
.master("local")
.appName("StringIndexer")
.getOrCreate();
Dataset<Row> data = spark.read().format("libsvm").load("E:/bigdata/Spark/data/ml/sample_libsvm_data.txt");
// Automatically identify categorical features, and index them.
VectorIndexer indexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("indexed")
.setMaxCategories(3);
VectorIndexerModel indexerModel = indexer.fit(data);
Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps();
System.out.println("Chose " + categoryMaps.size() + " categorical features:");
// Create new column "indexed" with categorical values transformed to indices
Dataset<Row> indexedData = indexerModel.transform(data);
indexedData.show(false);
spark.stop();
}
}
The output is as follows:
+-----+-------------------------+-------------------------+
|label|features |indexed |
+-----+-------------------------+-------------------------+
|0.0 |(3,[0,1,2],[2.0,5.0,7.0])|(3,[0,1,2],[2.0,1.0,7.0])|
|1.0 |(3,[0,1,2],[3.0,5.0,9.0])|(3,[0,1,2],[3.0,1.0,9.0])|
|1.0 |(3,[0,1,2],[1.0,5.0,8.0])|(3,[0,1,2],[1.0,1.0,8.0])|
|0.0 |(3,[0,1,2],[2.0,5.0,7.0])|(3,[0,1,2],[2.0,1.0,7.0])|
|1.0 |(3,[0,1,2],[5.0,1.0,6.0])|(3,[0,1,2],[5.0,0.0,6.0])|
|1.0 |(3,[0,1,2],[3.0,5.0,7.0])|(3,[0,1,2],[3.0,1.0,7.0])|
+-----+-------------------------+-------------------------+
Result analysis:
The feature vectors contain 3 features, at positions 0, 1, and 2.
Feature 0 takes 4 distinct values (1, 2, 3, 5), more than the configured maxCategories of 3, so it is treated as continuous and left unchanged.
Feature 1 takes 2 distinct values (1, 5), no more than 3, so it is re-indexed:
1 ---> 0.0
5 ---> 1.0
Feature 2 takes 4 distinct values (6, 7, 8, 9), more than 3, so it is treated as continuous and left unchanged.
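The mapping for feature 1 (1 -> 0.0, 5 -> 1.0) can also be read directly from the fitted model. A minimal sketch, reusing the categoryMaps variable from VectorIndexerDemo above:
// categoryMaps: feature index -> (original value -> category index)
for (Map.Entry<Integer, Map<Double, Integer>> feature : categoryMaps.entrySet()) {
System.out.println("Feature " + feature.getKey() + " is categorical:");
for (Map.Entry<Double, Integer> value : feature.getValue().entrySet()) {
System.out.println("  " + value.getKey() + " -> " + value.getValue());
}
}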