Several Ways to Create a DataFrame

1. Creating a DataFrame from a JSON file

The JSON file looks like this:

{"name":"Fanbingbing", "score":100}
{"name":"Xuruyun", "score":99}
{"name":"Liangyongqi", "score":74}

Java code:

package demo.java.cn;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class DataFrameFromJson
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("jsonfile");
        SparkContext sc = new SparkContext(conf);
        // create the SQLContext
        SQLContext sqlContext = new SQLContext(sc);
        // read the JSON file
        DataFrame df = sqlContext.read().format("json").load("star.json");
        df.show(); // display the DataFrame contents; show() prints the first 20 rows by default, pass a count to show(n) for more
        df.printSchema(); // print the schema
        // register the DataFrame as a temporary table; it lives only in memory and is never written to disk
        df.registerTempTable("startable");
        DataFrame sqlDf = sqlContext.sql("select * from startable where score >80");
        sqlDf.show();
        sc.stop();
    }
}

Output:

+-----------+-----+
|       name|score|
+-----------+-----+
|Fanbingbing|  100|
|    Xuruyun|   99|
|Liangyongqi|   74|
+-----------+-----+

root
 |-- name: string (nullable = true)
 |-- score: long (nullable = true)

+-----------+-----+
|       name|score|
+-----------+-----+
|Fanbingbing|  100|
|    Xuruyun|   99|
+-----------+-----+

Scala code:

package demo.scala.cn

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object DataFrameFromJson {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("jsonfile")
    val sc=new SparkContext(conf)
    val sqlContext=new SQLContext(sc)
    val df=sqlContext.read.json("star.json")
    df.show() // display the DataFrame contents; show() prints the first 20 rows by default, pass a count to show(n) for more
    df.printSchema() // print the schema
    df.registerTempTable("startable")
    val sqlDf=sqlContext.sql("select * from startable where score >80")
    sqlDf.show()
    sc.stop()
  }
}
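The SQL query above is not the only option: the same filter can be written with the DataFrame DSL directly, without registering a temporary table. A minimal Scala sketch (these lines would replace the registerTempTable/sql pair inside the main method above):

// equivalent to "select * from startable where score > 80", expressed with the DSL
df.filter(df("score") > 80).show()
// columns can also be selected and ordered the same way
df.select(df("name"), df("score")).orderBy(df("score").desc).show()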

2. Creating a DataFrame from a non-JSON file

The data file looks like this:

Fanbingbing,100
Xuruyun,99
Liangyongqi,74

Java code:

package demo.java.cn;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.Arrays;
import java.util.List;

public class DataFrameFromFile
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("rddStruct");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        JavaRDD<String> linesRDD = sc.textFile("star.txt");
        JavaRDD<Row> rowRDD = linesRDD.map(new Function<String, Row>()
        {
            private static final long serialVersionUID = 1L;
            public Row call(String s) throws Exception
            {
                String[] split = s.split(",");
                return RowFactory.create( // the field order here must match the StructFields defined below
                        String.valueOf(split[0]),
                        Integer.valueOf(split[1])
                );
            }
        });
        List<StructField> asList = Arrays.asList(
                DataTypes.createStructField("name", DataTypes.StringType, true),
                DataTypes.createStructField("score", DataTypes.IntegerType, true)
        );
        StructType schema = DataTypes.createStructType(asList);
        DataFrame df = sqlContext.createDataFrame(rowRDD, schema);
        df.show();
        // convert the DataFrame back to an RDD of Rows
        JavaRDD<Row> rowRDD2 = df.javaRDD();
        rowRDD2.foreach(new VoidFunction<Row>()
        {
            public void call(Row row) throws Exception
            {
                System.out.print(row.getString(0));
                System.out.println(","+row.getInt(1));
            }
        });
        sc.stop();
    }
}

Scala code:

package demo.scala.cn

import org.apache.spark.sql.{RowFactory, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}

object DataFrameFromFile {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("rddStruct")
    val sc = new SparkContext(conf)
    val sqlContext=new SQLContext(sc)
    val linesRDD = sc.textFile("star.txt")
    val rowRDD = linesRDD.map { x => {
      val split = x.split(",")
      RowFactory.create(split(0), Integer.valueOf(split(1)))
    }}
    val schema = StructType(List(
      StructField("name", StringType, true),
      StructField("score", IntegerType, true)
    ))
    val df=sqlContext.createDataFrame(rowRDD,schema)
    df.show()
    df.printSchema()
    sc.stop()
  }
}
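A third option in Scala, when the file is plain text, is to map each line onto a case class and let Spark infer the schema by reflection instead of building a StructType by hand. A minimal sketch (the case class Star and the object name are made up for this example):

package demo.scala.cn

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

// the case class must be defined outside the method so Spark can reflect on it;
// its field names become the column names
case class Star(name: String, score: Int)

object DataFrameFromCaseClass {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("rddCaseClass")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._ // brings in the rdd.toDF() conversion

    val df = sc.textFile("star.txt")
      .map(_.split(","))
      .map(parts => Star(parts(0), parts(1).trim.toInt))
      .toDF()

    df.show()
    df.printSchema()
    sc.stop()
  }
}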

3. Saving a DataFrame as a Parquet file. There are two ways to write Parquet (a Scala sketch of both follows the two snippets below):

a.

df.write().mode(SaveMode.Overwrite).format("parquet").save("./sparksql/parquet");

b.

df.write().mode(SaveMode.Overwrite).parquet("./sparksql/parquet");
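The snippets above use the Java API; in Scala the same two paths look like this (a small sketch, assuming df is the DataFrame built in section 2 -- SaveMode.Overwrite replaces any existing output, while Append, Ignore and ErrorIfExists behave as their names suggest):

import org.apache.spark.sql.SaveMode

// a. via the generic format/save API
df.write.mode(SaveMode.Overwrite).format("parquet").save("./sparksql/parquet")
// b. via the parquet() shortcut
df.write.mode(SaveMode.Overwrite).parquet("./sparksql/parquet")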

4. Creating a DataFrame by reading a Parquet file

Java code:

package demo.java.cn;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class DataFrameFromParquet
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("fromparquet");
        SparkContext sc = new SparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        DataFrame df = sqlContext.read().parquet("./sparksql/parquet");
        df.show();
        sc.stop();
    }
}

Scala code:

package demo.scala.cn

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object DataFrameFromParquet {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("fromparquet")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val df = sqlContext.read.parquet("./sparksql/parquet")
    df.show()
    sc.stop()
  }
}

5. Creating a DataFrame from data in MySQL

The data in MySQL looks like this:

mysql> select * from Star;
+-------------+-------+
| name        | score |
+-------------+-------+
| Fanbingbing |   100 |
| Xuruyun     |    99 |
| Liangyongqi |    74 |
+-------------+-------+
3 rows in set (0.00 sec)

Java code:

package demo.java.cn;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

import java.util.HashMap;
import java.util.Map;

public class DataFrameFromMysql
{
    public static void main(String[] args)
    {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("mysql");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        Map<String, String> options = new HashMap<String, String>();
        options.put("url", "jdbc:mysql://master.cn:3306/db_spark");
        options.put("driver", "com.mysql.jdbc.Driver");
        options.put("user", "root");
        options.put("password", "123456");
        options.put("dbtable", "Star");
        DataFrame df = sqlContext.read().format("jdbc").options(options).load();
        df.show();
        sc.stop();
    }
}

Scala code:

package demo.scala.cn

import java.util
import java.util.Properties

import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.{SparkConf, SparkContext}

object DataFrameFromMysql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local").setAppName("mysql")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    val options = new util.HashMap[String, String]()
    options.put("url", "jdbc:mysql://master.cn:3306/db_spark")
    options.put("driver", "com.mysql.jdbc.Driver")
    options.put("user", "root")
    options.put("password", "123456")
    options.put("dbtable", "Star")
    val df = sqlContext.read.format("jdbc").options(options).load()
    df.show()
    // write the DataFrame's rows back into MySQL, appending them to the "result" table
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "123456")
    df.write.mode(SaveMode.Append).jdbc("jdbc:mysql://master.cn:3306/db_spark", "result", properties)
    sc.stop()
  }
}
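Instead of an options map, the read side can also go through read.jdbc with a java.util.Properties object, mirroring the write call above. A minimal sketch reusing the same connection settings (these lines would sit inside the main method above):

// alternative read path: read.jdbc(url, table, properties)
val readProps = new Properties()
readProps.setProperty("user", "root")
readProps.setProperty("password", "123456")
readProps.setProperty("driver", "com.mysql.jdbc.Driver")
val df2 = sqlContext.read.jdbc("jdbc:mysql://master.cn:3306/db_spark", "Star", readProps)
df2.show()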
