spark官网
DT大数据梦工厂
spark sql操作各种数据源的数据流转 :
各种数据源的输入 => RDD(lines) =>RDD(Rows) => DataFrame(注册临时表) => 分析与过滤(各种sql操作、机器学习等)=> RDD(Row) => 各种格式的输出
Spark sql怎么操作各种数据源: json格式的文件、hive、mysql、hbase等
代码
package main.scala
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.functions._
/**
* 使用spark sql 内置函数对数据进行分析
* 内置函数返回一个列Column
* 分类:
* 1、聚合函数 2、集合函数 3、日期、时间函数 4、数学函数 5、开窗函数 6、字符串处理函数 7、其他
*/
object sqlagg {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("sparkinnerfunctions")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val sqlContext = new SQLContext(sc)
/*
* 1. 获取数据源 - RDD(line)
*/
val userData = Array(
"2016-5-29,001,http://spark.apache.org/,1000",
"2016-5-20,001,http://spark.apache.org/,1090",
"2016-5-20,001,http://spark.apache.org/,1060",
"2016-5-30,002,http://spark.apache.org/,1000",
"2016-5-30,003,http://spark.apache.org/,1000",
"2016-5-10,003,http://spark.apache.org/,1020",
"2016-5-10,003,http://spark.apache.org/,1020",
"2016-5-10,003,http://spark.apache.org/,1000"
)
var dataRDD = sc.parallelize(userData)
/*
* 2. 转换成 RDD(Row)
*/
val rowRDD = dataRDD.map(line => {
val splited = line.split(",")
Row(splited(0),splited(1),splited(2),splited(3))
})
/*
* 3. 指定Row的数据结构 并生成 DataFrame
*/
val structTypes = StructType( Array(
StructField("time",StringType,true),
StructField("userid",StringType,true),
StructField("url",StringType,true),
StructField("amount",StringType,true)
) )
val userDataDF = sqlContext.createDataFrame(rowRDD, structTypes)
/*
* 4 . 使用Spark SQL提供的内置函数对DataFrame进行操作(需要导入相关隐式转换信息)
* :内置函数生成column对象
*/
import sqlContext.implicits._
//按日期分类,然后进行聚合操作: 去重 userid, 计算每天的销售总量
userDataDF.groupBy("time").agg('time, countDistinct('userid)).map(row=>Row(row(1),row(2))).collect().foreach(println)
userDataDF.groupBy("time").agg('time, sum('amount)).show()
}
}
执行结果
[2016-5-10,1]
[2016-5-20,1]
[2016-5-29,1]
[2016-5-30,2]
+---------+---------+-----------+
| time| time|sum(amount)|
+---------+---------+-----------+
|2016-5-10|2016-5-10| 3040.0|
|2016-5-20|2016-5-20| 2150.0|
|2016-5-29|2016-5-29| 1000.0|
|2016-5-30|2016-5-30| 2000.0|
+---------+---------+-----------+
代码
package main.scala
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
/**
- spark sql操作 本地json文件
*/
object DataFrameOps {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("DataFram Ops")
val sqlContext = new SQLContext(new SparkContext(conf))
val df = sqlContext.read.json("file:///home/hadoop/spark-1.6.0-bin-hadoop2.6/examples/src/main/resources/people.json")
df.show
df.printSchema
df.select("name").show
df.select("name", "age").show
df.select(df("name"),df("age")+10).show
df.filter(df("age")>10).show
}
}
执行结果
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+
root
|-- age: long (nullable = true)
|-- name: string (nullable = true)
+----+
|name|
+----+
|Andy|
+----+
+----+---+
|name|age|
+----+---+
|Andy| 30|
+----+---+
+----+----------+
|name|(age + 10)|
+----+----------+
|Andy| 40|
+----+----------+
+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+
二. Hive
package cool.pengych.spark.sql;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.hive.HiveContext;
/**
* SparkSQL OPS on Hive
* @author pengyucheng
*
*/
public class SparkSQL2Hive
{
public static void main(String[] args)
{
SparkConf config = new SparkConf().setAppName("SparkSQL2Hive");
SparkContext sc = new SparkContext(config);
HiveContext hiveContext = new HiveContext(sc);
hiveContext.sql("use hive");
hiveContext.sql("DROP TABLE IF EXISTS people");
hiveContext.sql("CREATE TABLE IF NOT EXISTS people(name STRING,age INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'");
/*
* (把本地数据加载到Hive数据仓库中(背后实际上发生了数据的拷贝),
* 当然也可以通过LOAD DATA INPATH去获取HDFS等上的数据到Hive(此时发生了数据的移动)
*/
hiveContext.sql("LOAD DATA LOCAL INPATH '/home/pengyucheng/resource/people.txt' INTO TABLE people ");
hiveContext.sql("DROP TABLE IF EXISTS peoplescores");
hiveContext.sql("CREATE TABLE IF NOT EXISTS peoplescores(name STRING,score INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LINES TERMINATED BY '\n'");
hiveContext.sql("LOAD DATA LOCAL INPATH '/home/pengyucheng/resource/peoplescores.txt' INTO TABLE peoplescores");
/*
* 通过HiveContext使用join直接基于Hive中的两张表进行操作
*/
DataFrame resultDF = hiveContext.sql("SELECT pi.name,pi.age,ps.score FROM people pi JOIN peoplescores ps ON pi.name = ps.name WHERE ps.score > 75");
/*
* 通过saveAsTable创建一张 Hive Managerd Table,数据的元数据和数据的具体位置都是由Hive数据仓库
* 进行管理的,当删除该表的时候,数据也会一起被删除(磁盘上的数据不再存在)
*/
hiveContext.sql("DROP TABLE IF EXISTS peopleinformationresult");
resultDF.saveAsTable("peopleinformationresult");
/*
* 使用HiveContext的table方法可以直接读取Hive数据仓库中的Table并生成DataFrame,接下来就可以进行机器学习、图计算
* 等各种复杂ETL操作
*/
DataFrame dataFromHive = hiveContext.table("peopleinformationresult");
dataFromHive.show();
}
}
package cool.pengych.spark.sql;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
/**
* SparkSQL 通过 JDBC 操作 MySQL数据库
* @author pengyucheng
*
*/
public class SparkSQLJDBC2MySQL
{
public static void main(String[] args)
{
SparkConf config = new SparkConf().setMaster("local[*]").setAppName("SparkSQLJDBC2MySQL");
SparkContext sc = new SparkContext(config);
sc.addJar("/home/hadoop/spark-1.6.0-bin-hadoop2.6/lib/mysql-connector-java-5.1.39-bin.jar");
SQLContext sqlContext = new SQLContext(sc);
/*
* 1、连接数据库:通过format(“jdbc”)的方式说明SparkSQL操作的数据来源是通过JDBC获得,JDBC后端
* 一般都是数据库,eg、MySQL; 传递相关DB链接信息
*/
DataFrameReader reader = sqlContext.read().format("jdbc");
reader.option("url", "jdbc:mysql://112.74.21.122:3306/hive");
reader.option("driver", "com.mysql.jdbc.Driver");
reader.option("dbtable", "spark");
reader.option("user", "hive");
reader.option("password", "hive");
/*
* 2、加载相关数据
*/
DataFrame sparkDF = reader.load();
reader.option("dbtable", "hadoop");
DataFrame hadoopDF = reader.load();
/*
* 3、用Spark core组织待处理的数据:这里以进行join操作(DataFrame转换成 RDD后进行)为例
*/
JavaPairRDD> resultRDD = sparkDF.javaRDD().mapToPair(new PairFunction() {
private static final long serialVersionUID = 1L;
@Override
public Tuple2 call(Row row) throws Exception {
return new Tuple2(String.valueOf(row.getAs("name")), (Integer) row.getAs("age"));
}
}).join(hadoopDF.javaRDD().mapToPair(new PairFunction() {
private static final long serialVersionUID = 1L;
@Override
public Tuple2 call(Row row) throws Exception {
return new Tuple2(String.valueOf(row.getAs("name")), (Integer)row.getAs("score"));
}
}));
/*
* 4、将组织好的数据交给 DataFrame 做业务处理 - 可以利用 Spark SQL 、Core、ML等进行复杂的操作!!!
*/
// 获取 JavaRDD
JavaRDD resultRowRDD = resultRDD.map(new Function>, Row>() {
private static final long serialVersionUID = 1L;
@Override
public Row call(Tuple2> tuple) throws Exception {
return RowFactory.create(tuple._1,tuple._2._1,tuple._2._2);
}
});
//构建StructType,用于最后DataFrame元数据的描述
List fields = new ArrayList();
fields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
fields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
StructType type = DataTypes.createStructType(fields);
DataFrame personsDF = sqlContext.createDataFrame(resultRowRDD, type);
// 具体业务处理 - 这里只是简单的 show 一下
System.out.println("========================业务处理开始:ML,图计算等工具处理=================");
System.out.println("==== start showing ====");
personsDF.show();
System.out.println("========================业务处理结束:ML,图计算等工具处理=================");
/*
* 5、保存处理后的数据:可以存放到hive,db等数据仓库中
*/
personsDF.javaRDD().foreachPartition(new VoidFunction>() {
private static final long serialVersionUID = 1L;
@Override
public void call(Iterator iterator) throws SQLException
{
Connection conn = null;
StringBuilder sql = new StringBuilder("INSERT INTO dfperson VALUES ( ");
while(iterator.hasNext())
{
Row row = iterator.next();
sql.append(String.valueOf(row.getAs("name"))).append(",").append(row.getInt(1)).append(",").append(row.getInt(2));
}
sql.append(")");
try
{
conn = DriverManager.getConnection("jdbc:mysql://112.74.21.122:3306/hive", "hive", "hive");
boolean flag = conn.createStatement().execute(sql.toString());
}
catch (SQLException e)
{
e.printStackTrace();
}
finally
{
if(null != conn) conn.close();
}
}
});
}
}
package cool.pengych.spark.sql;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
/**
* @author pengyucheng
* 通过JDBC访问Thrift Server,进而访问Spark SQL,进而访问Hive
*
*/
public class SparkSQLJDBC2ThriftServer
{
public static void main(String[] args) throws ClassNotFoundException, SQLException
{
String sql = "select name from people wher age = ? ";
Class.forName("org.apache.hive.jdbc.HiveDriver");
Connection conn = DriverManager.getConnection("jdbc:hive2://localhost:10000/default?hive.server2.transport.mode=http;hive.server2.thrift.http.path=cliservice",
"root","");
conn.prepareStatement(sql);
PreparedStatement preparedStatement = conn.prepareStatement(sql);
preparedStatement.setInt(0, 27);
ResultSet rs = preparedStatement.executeQuery();
while(rs.next())
{
System.out.println(rs.getString(1));
}
conn.close();
}
}
package cool.pengych.spark.sql;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
/**
* Spark SQL操作 Parquet 格式的文件内容
* @author pengyucheng
*
*/
public class SparkSQLParquet
{
public static void main(String[] args) {
/*
* 创建SQLContext
*/
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("SparkSQLParquet");
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(jsc);
/*
* 注册成为临时表以供后续SQL查询操作
*/
DataFrame df = sqlContext.read().parquet("file:///home/hadoop/spark-1.6.0-bin-hadoop2.6/examples/src/main/resources/users.parquet");
df.registerTempTable("users");
/*
* 进行数据的多维度分析
*/
DataFrame result = sqlContext.sql("select name from users");
JavaRDD strs = result.javaRDD().map(new Function() {
@Override
public String call(Row row) throws Exception {
return "The name is :"+row.getAs("name");
}
});
/*
* 对结果进行处理
*/
List listRow = strs.collect();
for (String row : listRow) {
System.out.println(row);
}
}
}
用蓬勃的生命力战胜人性的悲剧性!package cool.pengych.spark.sql;