spark2.1.0-mongodb

1.从MongoDB读取

package com.mongodb.spark;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.bson.Document;

import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.rdd.api.java.JavaMongoRDD;

public final class ReadFromMongoDB {

	/**
	 * Reads the collection configured by {@code spark.mongodb.input.uri} into a
	 * {@code JavaMongoRDD<Document>} and prints its size plus the first document
	 * rendered as JSON.
	 */
	public static void main(final String[] args) throws InterruptedException {

		SparkSession spark = SparkSession.builder().master("local").appName("MongoSparkConnectorIntro")
				.config("spark.mongodb.input.uri", "mongodb://172.28.34.xxx:27117/wangzs.zhaopin")
				.config("spark.mongodb.output.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparkmongo").getOrCreate();

		// Create a JavaSparkContext using the SparkSession's SparkContext object
		JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

		try {
			/* Start Example: Read data from MongoDB ************************/
			// Parameterize with Document instead of using the raw type.
			JavaMongoRDD<Document> rdd = MongoSpark.load(jsc);
			/* End Example **************************************************/

			// Analyze data from MongoDB. Count once and reuse the value.
			long count = rdd.count();
			System.out.println(count);
			// first() throws UnsupportedOperationException on an empty RDD,
			// so guard against an empty input collection.
			if (count > 0) {
				System.out.println(rdd.first().toJson());
			}
		} finally {
			// Always release the context, even if a Spark job fails.
			jsc.close();
		}

	}
}

2.写入MongoDB

package com.mongodb.spark;

import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.config.WriteConfig;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.SparkSession;

import org.bson.Document;

import static java.util.Arrays.asList;

import java.util.HashMap;
import java.util.Map;

public final class WriteToMongoDBWriteConfig {

	/**
	 * Generates ten small documents and saves them to MongoDB using a custom
	 * {@link WriteConfig} that overrides the target collection ("spark") and
	 * the write concern ("majority").
	 */
	public static void main(final String[] args) throws InterruptedException {

		SparkSession spark = SparkSession.builder().master("local").appName("MongoSparkConnectorIntro")
				.config("spark.mongodb.input.uri", "mongodb://172.28.34.xxx:27117/wangzs.zhaopin")
				.config("spark.mongodb.output.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparkmongo").getOrCreate();

		JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

		try {
			// Create a custom WriteConfig — use a typed map, not a raw HashMap.
			Map<String, String> writeOverrides = new HashMap<>();
			writeOverrides.put("collection", "spark");
			writeOverrides.put("writeConcern.w", "majority");
			WriteConfig writeConfig = WriteConfig.create(jsc).withOptions(writeOverrides);

			// Create a RDD of 10 documents (typed, avoiding raw JavaRDD/Function).
			JavaRDD<Document> sparkDocuments = jsc.parallelize(asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
					.map(new Function<Integer, Document>() {
						public Document call(final Integer i) throws Exception {
							return Document.parse("{spark: " + i + ",name:" + i + "}");
						}
					});

			/* Start Example: Save data from RDD to MongoDB *****************/
			MongoSpark.save(sparkDocuments, writeConfig);
			/* End Example **************************************************/
		} finally {
			// Always release the context, even if the save fails.
			jsc.close();
		}

	}

}

3.聚合

package com.mongodb.spark;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.bson.Document;

import com.mongodb.spark.MongoSpark;
import com.mongodb.spark.rdd.api.java.JavaMongoRDD;

import static java.util.Collections.singletonList;

public final class Aggregation {

	/**
	 * Loads the input collection as a {@code JavaMongoRDD<Document>} and pushes
	 * a {@code $match} aggregation stage down to MongoDB so that only documents
	 * whose {@code gzdd} field equals "上海-普陀区" are returned to Spark.
	 */
	public static void main(final String[] args) throws InterruptedException {

		SparkSession spark = SparkSession.builder().master("local").appName("Aggregation")
				.config("spark.mongodb.input.uri", "mongodb://172.28.34.xxx:27117/wangzs.zhaopin")
				.config("spark.mongodb.output.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparkmongo").getOrCreate();

		// Create a JavaSparkContext using the SparkSession's SparkContext object
		JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

		try {
			// Load and analyze data from MongoDB (typed, not the raw JavaMongoRDD).
			JavaMongoRDD<Document> rdd = MongoSpark.load(jsc);

			/* Start Example: Use aggregation to filter a RDD ***************/
			// The pipeline runs server-side in MongoDB before data reaches Spark.
			JavaMongoRDD<Document> aggregatedRdd = rdd
					.withPipeline(singletonList(Document.parse("{ $match: { 'gzdd' : '上海-普陀区' } }")));
			/* End Example **************************************************/

			// Analyze data from MongoDB
			System.out.println(aggregatedRdd.count());
			System.out.println(aggregatedRdd.collect());
		} finally {
			// Always release the context, even if a Spark job fails.
			jsc.close();
		}

	}
}

4.数据集和SQL

/* 1 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac96"),
    "name" : "Bilbo Baggins",
    "age" : 50.0
}

/* 2 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac97"),
    "name" : "Gandalf",
    "age" : 1000.0
}

/* 3 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac98"),
    "name" : "Thorin",
    "age" : 195.0
}

/* 4 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac99"),
    "name" : "Balin",
    "age" : 178.0
}

/* 5 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac9a"),
    "name" : "Kíli",
    "age" : 77.0
}

/* 6 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac9b"),
    "name" : "Dwalin",
    "age" : 169.0
}

/* 7 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac9c"),
    "name" : "Óin",
    "age" : 167.0
}

/* 8 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac9d"),
    "name" : "Glóin",
    "age" : 158.0
}

/* 9 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac9e"),
    "name" : "Fíli",
    "age" : 82.0
}

/* 10 */
{
    "_id" : ObjectId("5ae911d3460fcf70c940ac9f"),
    "name" : "Bombur"
}
package com.mongodb.spark;

import java.io.Serializable;

/**
 * Simple serializable bean used as the explicit schema when loading MongoDB
 * documents into a typed {@code Dataset<Character>}.
 *
 * <p>NOTE(review): the sample documents store "age" as a double (e.g. 50.0)
 * while this bean declares {@code Integer} — confirm the connector coerces
 * the value as expected.
 */
public final class Character implements Serializable {

	// Explicit serialVersionUID: recommended for Serializable classes so the
	// serialized form stays stable across recompilation.
	private static final long serialVersionUID = 1L;

	private String name;
	private Integer age;

	/** @return the character's name, or null if unset */
	public String getName() {
		return name;
	}

	public void setName(final String name) {
		this.name = name;
	}

	/** @return the character's age, or null if the document had no "age" field */
	public Integer getAge() {
		return age;
	}

	public void setAge(final Integer age) {
		this.age = age;
	}
}

package com.mongodb.spark;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public final class DatasetSQLDemo {

	/**
	 * Loads MongoDB documents into a typed {@code Dataset<Character>}, registers
	 * it as a temp view, selects every character aged 100 or more via Spark SQL,
	 * and writes the result to the "hundredClub" collection (overwriting it).
	 */
	public static void main(final String[] args) throws InterruptedException {

		SparkSession spark = SparkSession.builder().master("local").appName("Aggregation")
				.config("spark.mongodb.input.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparktest")
				.config("spark.mongodb.output.uri", "mongodb://172.28.34.xxx:27117/wangzs.sparkmongo").getOrCreate();

		// Create a JavaSparkContext using the SparkSession's SparkContext object
		JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

		try {
			// Load data with explicit schema — typed instead of a raw Dataset.
			Dataset<Character> explicitDS = MongoSpark.load(jsc).toDS(Character.class);
			explicitDS.printSchema();
			explicitDS.show();

			// Create the temp view and execute the query
			explicitDS.createOrReplaceTempView("characters");
			Dataset<Row> centenarians = spark.sql("SELECT name, age FROM characters WHERE age >= 100");
			centenarians.show();

			// Write the data to the "hundredClub" collection, replacing any
			// existing contents ("overwrite" mode).
			MongoSpark.write(centenarians).option("collection", "hundredClub").mode("overwrite").save();
		} finally {
			// Always release the context, even if a Spark job fails.
			jsc.close();
		}

	}
}


	4.0.0

	com.wangzs
	spark-2.1.0-learn
	0.0.1-SNAPSHOT
	jar

	spark-2.1.0-learn
	http://maven.apache.org

	
		
			junit
			junit
			4.12
			test
		

		 
			org.apache.spark
			spark-core_2.11
			2.1.0
		

		
			org.mongodb.spark
			mongo-spark-connector_2.11
			2.1.0
		
		
			org.apache.spark
			spark-sql_2.11
			2.1.0
		

	

	
		
			
				
					org.apache.maven.plugins
					maven-compiler-plugin
					
						1.8
						1.8
					
				
				
					org.apache.maven.plugins
					maven-resources-plugin
					
						UTF-8
					
				
				
				
					org.apache.maven.plugins
					maven-surefire-plugin
					
						true
					
				
			
		
	


5.pom文件


	4.0.0

	com.wangzs
	spark-2.1.0-learn
	0.0.1-SNAPSHOT
	jar

	spark-2.1.0-learn
	http://maven.apache.org

	
		
			junit
			junit
			4.12
			test
		

		 
			org.apache.spark
			spark-core_2.11
			2.1.0
		

		
			org.mongodb.spark
			mongo-spark-connector_2.11
			2.1.0
		
		
			org.apache.spark
			spark-sql_2.11
			2.1.0
		

	

	
		
			
				
					org.apache.maven.plugins
					maven-compiler-plugin
					
						1.8
						1.8
					
				
				
					org.apache.maven.plugins
					maven-resources-plugin
					
						UTF-8
					
				
				
				
					org.apache.maven.plugins
					maven-surefire-plugin
					
						true
					
				
			
		
	


你可能感兴趣的:(spark)