1. Creating a DataFrame by reading a JSON file
The JSON file looks like this:
{"name":"Fanbingbing", "score":100}
{"name":"Xuruyun", "score":99}
{"name":"Liangyongqi", "score":74}
Java code:
package demo.java.cn;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
public class DataFrameFromJson
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("jsonfile");
        SparkContext sc = new SparkContext(conf);
        // create the SQLContext
        SQLContext sqlContext = new SQLContext(sc);
        // read the JSON file
        DataFrame df = sqlContext.read().format("json").load("star.json");
        df.show();        // print the DataFrame's contents; to show more rows, pass the row count: show(n)
        df.printSchema(); // print the schema
        // register the DataFrame as a temporary table; it lives only in memory and is never written to disk
        df.registerTempTable("startable");
        DataFrame sqlDf = sqlContext.sql("select * from startable where score > 80");
        sqlDf.show();
        sc.stop();
    }
}
Output:
+-----------+-----+
| name|score|
+-----------+-----+
|Fanbingbing| 100|
| Xuruyun| 99|
|Liangyongqi| 74|
+-----------+-----+
root
|-- name: string (nullable = true)
|-- score: long (nullable = true)
+-----------+-----+
| name|score|
+-----------+-----+
|Fanbingbing| 100|
| Xuruyun| 99|
+-----------+-----+
Scala code:
package demo.scala.cn
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
object DataFrameFromJson {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("jsonfile")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val df = sqlContext.read.json("star.json")
    df.show()        // print the DataFrame's contents; to show more rows, pass the row count: show(n)
    df.printSchema() // print the schema
    df.registerTempTable("startable")
    val sqlDf = sqlContext.sql("select * from startable where score > 80")
    sqlDf.show()
    sc.stop()
  }
}
2. Creating a DataFrame from a non-JSON file
The data file looks like this:
Fanbingbing,100
Xuruyun,99
Liangyongqi,74
Java code:
package demo.java.cn;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.util.Arrays;
import java.util.List;
public class DataFrameFromFile
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("rddStruct");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        JavaRDD<String> linesRDD = sc.textFile("star.txt");
        JavaRDD<Row> rowRDD = linesRDD.map(new Function<String, Row>()
        {
            private static final long serialVersionUID = 1L;
            public Row call(String s) throws Exception
            {
                String[] split = s.split(",");
                return RowFactory.create( // the field order here must match the StructFields defined below
                        String.valueOf(split[0]),
                        Integer.valueOf(split[1])
                );
            }
        });
        List<StructField> asList = Arrays.asList(
                DataTypes.createStructField("name", DataTypes.StringType, true),
                DataTypes.createStructField("score", DataTypes.IntegerType, true)
        );
        StructType schema = DataTypes.createStructType(asList);
        DataFrame df = sqlContext.createDataFrame(rowRDD, schema);
        df.show();
        // convert the DataFrame back to an RDD
        JavaRDD<Row> rowRDD2 = df.javaRDD();
        rowRDD2.foreach(new VoidFunction<Row>()
        {
            public void call(Row row) throws Exception
            {
                System.out.print(row.getString(0));
                System.out.println("," + row.getInt(1));
            }
        });
        sc.stop();
    }
}
Scala code:
package demo.scala.cn
import org.apache.spark.sql.{RowFactory, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
object DataFrameFromFile {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("rddStruct")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val linesRDD = sc.textFile("star.txt")
    val rowRDD = linesRDD.map { x =>
      val split = x.split(",")
      RowFactory.create(split(0), Integer.valueOf(split(1)))
    }
    val schema = StructType(List(
      StructField("name", StringType, true),
      StructField("score", IntegerType, true)
    ))
    val df = sqlContext.createDataFrame(rowRDD, schema)
    df.show()
    df.printSchema()
    sc.stop()
  }
}
3. Saving a DataFrame as a Parquet file. There are two ways to write Parquet:
a.
df.write().mode(SaveMode.Overwrite).format("parquet").save("./sparksql/parquet");
b.
df.write().mode(SaveMode.Overwrite).parquet("./sparksql/parquet");
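For completeness, here is a minimal Scala sketch that puts the two write styles above into a full program: it reloads star.json as in section 1 and saves the DataFrame to ./sparksql/parquet (the path that section 4 reads back). The object name and app name are illustrative, and the second call simply overwrites the output of the first, so in practice you would pick one style.
package demo.scala.cn
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}
object DataFrameToParquet {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("toparquet")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val df = sqlContext.read.json("star.json")
    // style a: go through the generic format("parquet") writer
    df.write.mode(SaveMode.Overwrite).format("parquet").save("./sparksql/parquet")
    // style b: the parquet() shortcut, equivalent to the line above
    df.write.mode(SaveMode.Overwrite).parquet("./sparksql/parquet")
    sc.stop()
  }
}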
4. Creating a DataFrame by reading a Parquet file
Java code:
package demo.java.cn;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
public class DataFrameFromParquet
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("fromparquet");
        SparkContext sc = new SparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame df = sqlContext.read().parquet("./sparksql/parquet");
        df.show();
        sc.stop();
    }
}
Scala code:
package demo.scala.cn
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
object DataFrameFromParquet {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("fromparquet")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val df = sqlContext.read.parquet("./sparksql/parquet")
    df.show()
    sc.stop()
  }
}
5. Creating a DataFrame by reading data from MySQL
The data in MySQL looks like this:
mysql> select * from Star;
+-------------+-------+
| name | score |
+-------------+-------+
| Fanbingbing | 100 |
| Xuruyun | 99 |
| Liangyongqi | 74 |
+-------------+-------+
3 rows in set (0.00 sec)
Java code:
package demo.java.cn;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import java.util.HashMap;
import java.util.Map;
public class DataFrameFromMysql
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("mysql");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        Map<String, String> options = new HashMap<String, String>();
        options.put("url", "jdbc:mysql://master.cn:3306/db_spark");
        options.put("driver", "com.mysql.jdbc.Driver");
        options.put("user", "root");
        options.put("password", "123456");
        options.put("dbtable", "Star");
        DataFrame df = sqlContext.read().format("jdbc").options(options).load();
        df.show();
        sc.stop();
    }
}
Scala code:
package demo.scala.cn
import java.util
import java.util.Properties
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}
object DataFrameFromMysql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("mysql")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val options = new util.HashMap[String, String]()
    options.put("url", "jdbc:mysql://master.cn:3306/db_spark")
    options.put("driver", "com.mysql.jdbc.Driver")
    options.put("user", "root")
    options.put("password", "123456")
    options.put("dbtable", "Star")
    val df = sqlContext.read.format("jdbc").options(options).load()
    df.show()
    // insert the DataFrame's rows into MySQL (appends to the "result" table)
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "123456")
    df.write.mode(SaveMode.Append).jdbc("jdbc:mysql://master.cn:3306/db_spark", "result", properties)
    sc.stop()
  }
}
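To check that the append above actually landed in MySQL, the same connection details can be used to read the result table back with DataFrameReader.jdbc. A minimal sketch, assuming the same database, credentials, and table name as in the example above; the object name DataFrameFromMysqlResult and app name are illustrative.
package demo.scala.cn
import java.util.Properties
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
object DataFrameFromMysqlResult {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("mysqlresult")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // connection properties copied from the example above
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "123456")
    properties.setProperty("driver", "com.mysql.jdbc.Driver")
    // read.jdbc(url, table, properties) loads the whole "result" table as a DataFrame
    val resultDf = sqlContext.read.jdbc("jdbc:mysql://master.cn:3306/db_spark", "result", properties)
    resultDf.show()
    sc.stop()
  }
}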