groupId = org.apache.spark
artifactId = spark-hive_2.10
version = 1.2.0
If you cannot pull in the Hive dependency, use the artifact spark-sql_2.10 instead of spark-hive_2.10.
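For reference, a minimal sketch of the same dependency in sbt form (assuming an sbt build; swap in spark-sql_2.10 when Hive support is not needed):
// build.sbt
libraryDependencies += "org.apache.spark" % "spark-hive_2.10" % "1.2.0"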
hive.metastore.warehouse.dir = /user/map_navi_spark/hive/warehouse (location of default database for the warehouse)
hive.metastore.uris = thrift://10.153.53.249:9083
hive.exec.scratchdir = /user/map_navi_spark/hive/data-scratchdir (scratch space for Hive jobs)
hive.exec.stagingdir = /user/map_navi_spark/hive/data-stagingdir
hive.metastore.client.socket.timeout = 60
hive.cache.expr.evaluation = false (expression-evaluation caching breaks the sogou common-lib UDFs, so disable it)
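These settings normally live in a hive-site.xml on the classpath, but as a minimal sketch (assuming Spark 2.x with the spark-hive module) the same keys can also be set programmatically when the session is built; the app name here is hypothetical:
import org.apache.spark.sql.SparkSession

val sparkWithHive = SparkSession.builder()
  .appName("hive-config-sketch")
  .enableHiveSupport()
  .config("hive.metastore.uris", "thrift://10.153.53.249:9083")
  .config("hive.metastore.warehouse.dir", "/user/map_navi_spark/hive/warehouse")
  .config("hive.metastore.client.socket.timeout", "60")
  .getOrCreate()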
// Legacy (pre-2.0) API, kept for reference; HiveContext is deprecated:
// "Use SparkSession.builder.enableHiveSupport instead" (since 2.0.0)
// val sc = new SparkContext(conf)
// val hiveCtx = new HiveContext(sc)
// val input = hiveCtx.jsonFile("inputFile")
// // register the input SchemaRDD as a temporary table
// input.registerTempTable("tweets")
// // pick the top tweets by retweetCount (retweet count)
// val topTweets = hiveCtx.sql("SELECT text,retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
// conf is assumed to be an existing SparkConf
val spark: SparkSession = SparkSession.builder
  .appName("retweetCount")
  .master("local[3]")
  .enableHiveSupport()
  .config(conf)
  .getOrCreate()
// Alternatively, run SQL through the session directly:
// import spark.implicits._
// import spark.sql
// sql("SELECT text,retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
// the underlying SparkContext
val sc: SparkContext = spark.sparkContext
//sqlContext
val hiveCtx: SQLContext = spark.sqlContext
val input = hiveCtx.read.json("inputFile")
// register the input DataFrame as a temporary view
input.createOrReplaceTempView("tweets")
// pick the top tweets by retweetCount (retweet count)
val topTweets = hiveCtx.sql("SELECT text,retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
val spark: SparkSession = SparkSession.builder
  .appName("retweetCount")
  .master("local[3]")
  .enableHiveSupport()
  .config(conf)
  .getOrCreate()
// the underlying SparkContext
val sc: SparkContext = spark.sparkContext
// sqlContext
val hiveCtx: SQLContext = spark.sqlContext
val rows = hiveCtx.sql("SELECT key,value FROM mytable")
val keys = rows.map(row => row.getInt(0)) // Dataset[Int]; in Spark 2.x this needs import spark.implicits._ in scope for the Int encoder
val spark: SparkSession = SparkSession.builder
  .appName("retweetCount")
  .master("local[3]")
  .enableHiveSupport()
  .config(new SparkConf())
  .getOrCreate()
val df = spark.read.parquet("inputpath")
val passLinkDF = passLinkSaved.toDF()
passLinkDF.write.format("parquet").mode(SaveMode.Append).partitionBy("cdate").parquet("/user/map_navi_spark/stat/q4_all_passlink_parquet")
root
|-- tripID: string (nullable = true)
|-- from: struct (nullable = true)
| |-- status: string (nullable = true)
| |-- tripID: string (nullable = true)
| |-- gpsTm: long (nullable = true)
| |-- originX: double (nullable = true)
| |-- originY: double (nullable = true)
| |-- prjX: double (nullable = true)
| |-- prjY: double (nullable = true)
| |-- link: struct (nullable = true)
| | |-- id: integer (nullable = true)
| | |-- direct: integer (nullable = true)
| | |-- length: double (nullable = true)
| | |-- trip_index: integer (nullable = true)
| |-- distError: double (nullable = true)
| |-- angleError: double (nullable = true)
|-- to: struct (nullable = true)
| |-- status: string (nullable = true)
| |-- tripID: string (nullable = true)
| |-- gpsTm: long (nullable = true)
| |-- originX: double (nullable = true)
| |-- originY: double (nullable = true)
| |-- prjX: double (nullable = true)
| |-- prjY: double (nullable = true)
| |-- link: struct (nullable = true)
| | |-- id: integer (nullable = true)
| | |-- direct: integer (nullable = true)
| | |-- length: double (nullable = true)
| | |-- trip_index: integer (nullable = true)
| |-- distError: double (nullable = true)
| |-- angleError: double (nullable = true)
|-- v_prop: double (nullable = true)
|-- seqLink: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: integer (nullable = true)
| | |-- direct: integer (nullable = true)
| | |-- length: double (nullable = true)
| | |-- trip_index: integer (nullable = true)
|-- length: double (nullable = true)
|-- passspeed: double (nullable = true)
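Since the output above was written with partitionBy("cdate"), reading it back rediscovers cdate as a partition column. A minimal sketch of reading and pruning on it (the date value used here is hypothetical):
import org.apache.spark.sql.functions.col

val passLinkBack = spark.read
  .parquet("/user/map_navi_spark/stat/q4_all_passlink_parquet")
  .where(col("cdate") === "20170501") // partition pruning on a hypothetical value
passLinkBack.printSchema()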
case class HappyPerson(handle: String, favouriteBeverage: String)
// parallelize HappyPerson instances (not bare strings) so toDF yields the two named columns
val happyPersonRdd = sc.parallelize(List(HappyPerson("holden", "coffee")))
import spark.implicits._
val happyPersonDf = happyPersonRdd.toDF()
happyPersonDf.createOrReplaceTempView("viewName")
// the view can now be queried with SQL, as in the sketch below
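A minimal sketch of such a query; the column names come from the HappyPerson case class above:
spark.sql("SELECT handle, favouriteBeverage FROM viewName").show()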
Property names and meanings:
partitionColumn, lowerBound, upperBound, numPartitions: apply to reads only and must be specified together. partitionColumn is a numeric column used to split the table into numPartitions partitions; lowerBound and upperBound only determine the partition stride, they do not filter rows (all rows of the table are split up and returned).
fetchsize: applies to reads only. The JDBC fetch size, which determines how many rows are fetched per round trip. It can help the performance of JDBC drivers whose default fetch size is low (e.g. Oracle fetches 10 rows at a time).
batchsize: applies to writes only. The JDBC batch size, which determines how many rows are inserted per round trip. It can help JDBC driver performance; the default is 1000.
isolationLevel: applies to writes only. The transaction isolation level for the current connection. It can be NONE, READ_COMMITTED, READ_UNCOMMITTED, REPEATABLE_READ or SERIALIZABLE, matching the standard isolation levels defined by JDBC's Connection object; the default is READ_UNCOMMITTED. See the java.sql.Connection documentation.
truncate: applies to writes only. When SaveMode.Overwrite is enabled, this option truncates the existing table (here, in MySQL) instead of dropping and recreating it. That can be more efficient and keeps the table metadata (e.g. indexes), but it does not work in some cases, for example when the new data has a different schema. The default is false.
createTableOptions: applies to writes only. Allows database-specific table and partition options to be set when a table is created (e.g. CREATE TABLE t (name string) ENGINE=InnoDB). The write-side options are exercised in the hedged sketch below and in the full write example later in this section.
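A hedged sketch of a JDBC write exercising the write-side options above; the table name db.user_copy is a placeholder, the credentials follow the examples below, and df stands for any existing DataFrame:
import org.apache.spark.sql.SaveMode

df.write                                          // df: any existing DataFrame (placeholder)
  .mode(SaveMode.Overwrite)
  .format("jdbc")
  .option("driver", "com.mysql.jdbc.Driver")
  .option("url", "jdbc:mysql://ip:3306")
  .option("dbtable", "db.user_copy")              // placeholder table name
  .option("user", "test")
  .option("password", "123456")
  .option("batchsize", "1000")
  .option("isolationLevel", "READ_COMMITTED")
  .option("truncate", "true")                     // only honoured with SaveMode.Overwrite
  .option("createTableOptions", "ENGINE=InnoDB")  // only applied if Spark creates the table
  .save()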
val jdbcDF1 = spark.read.format("jdbc")
.option("driver", "com.mysql.jdbc.Driver")
.option("url", "jdbc:mysql://ip:3306")
.option("dbtable", "db.user_test")
.option("user", "test")
.option("password", "123456")
.option("fetchsize", "3")
.load()
jdbcDF1.show
val jdbcDF2 = spark.read.format("jdbc").options(
Map(
"driver" -> "com.mysql.jdbc.Driver",
"url" -> "jdbc:mysql://ip:3306",
"dbtable" -> "db.user_test",
"user" -> "test",
"password" -> "123456",
"fetchsize" -> "3")).load()
jdbcDF2.show
jdbc(url: String, table: String, properties: Properties): DataFrame
import java.util.Properties
// jdbc(url: String, table: String, properties: Properties): DataFrame
val readConnProp1 = new Properties()
readConnProp1.put("driver", "com.mysql.jdbc.Driver")
readConnProp1.put("user", "test")
readConnProp1.put("password", "123456")
readConnProp1.put("fetchsize", "3")
val jdbcDF3 = spark.read.jdbc(
"jdbc:mysql://ip:3306",
"db.user_test",
readConnProp1)
jdbcDF3.rdd.partitions.size // default parallelism is 1
jdbcDF3.show
val jdbcDF4 = spark.read.jdbc(
"jdbc:mysql://ip:3306",
"(select * from db.user_test where gender=1) t", // 注意括号和表别名,必须得有,这里可以过滤数据
readConnProp1)
jdbcDF4.show()
jdbc(url: String, table: String,
columnName: String, lowerBound: Long, upperBound: Long, numPartitions: Int,
connectionProperties: Properties): DataFrame
import java.util.Properties
val readConnProp2 = new Properties()
readConnProp2.put("driver", "com.mysql.jdbc.Driver")
readConnProp2.put("user", "test")
readConnProp2.put("password", "123456")
readConnProp2.put("fetchsize", "2")
val columnName = "uid"
val lowerBound = 1
val upperBound = 6
val numPartitions = 3
val jdbcDF5 = spark.read.jdbc(
"jdbc:mysql://ip:3306",
"db.user_test",
columnName,
lowerBound, // lowerBound and upperBound only decide how the partitions are split, not a row filter; all rows of the table are split up and returned
upperBound,
numPartitions,
readConnProp2)
jdbcDF5.rdd.partitions.size // parallelism is 3, matching numPartitions
jdbcDF5.show
jdbc(url: String, table: String, predicates: Array[String], connectionProperties: Properties): DataFrame
predicates: Condition in the WHERE clause for each partition.
import java.util.Properties
val readConnProp3 = new Properties()
readConnProp3.put("driver", "com.mysql.jdbc.Driver")
readConnProp3.put("user", "test")
readConnProp3.put("password", "123456")
readConnProp3.put("fetchsize", "2")
val arr = Array(
(1, 50),
(2, 60))
// these predicates both split the data into partitions (driving the parallelism) and filter it
val predicates = arr.map {
case (gender, age) =>
s" gender = $gender " + s" AND age < $age "
}
val predicates1 =
Array(
"2017-05-01" -> "2017-05-20",
"2017-06-01" -> "2017-06-05").map {
case (start, end) =>
s"cast(create_time as date) >= date '$start' " + s"AND cast(create_time as date) <= date '$end'"
}
val jdbcDF6 = spark.read.jdbc(
"jdbc:mysql://ip:3306",
"db.user_test",
predicates,
readConnProp3)
jdbcDF6.show
// For implicit conversions like converting RDDs to DataFrames
import spark.implicits._
val dataList: List[(Double, String, Double, Double, String, Double, Double, Double, Double)] = List(
(0, "male", 37, 10, "no", 3, 18, 7, 4),
(0, "female", 27, 4, "no", 4, 14, 6, 4),
(0, "female", 32, 15, "yes", 1, 12, 1, 4),
(0, "male", 57, 15, "yes", 5, 18, 6, 5),
(0, "male", 22, 0.75, "no", 2, 17, 6, 3),
(0, "female", 32, 1.5, "no", 2, 17, 5, 5),
(0, "female", 22, 0.75, "no", 2, 12, 1, 3),
(0, "male", 57, 15, "yes", 2, 14, 4, 4),
(0, "female", 32, 15, "yes", 4, 16, 1, 2))
val colArray: Array[String] = Array("affairs", "gender", "age", "yearsmarried", "children", "religiousness", "education", "occupation", "rating")
val df7 = dataList.toDF(colArray: _*) // expand colArray as a varargs sequence of column names
df7.write.mode("overwrite").format("jdbc").options(
Map(
"driver" -> "com.mysql.jdbc.Driver",
"url" -> "jdbc:mysql://ip:3306",
"dbtable" -> "db.affairs",
"user" -> "test",
"password" -> "123456",
"batchsize" -> "1000",
"truncate" -> "true")).save()
val sparkSession = SparkSession.builder()
.appName("metric_speed_sample")
.config(conf)
.config("spark.sql.parquet.compression.codec", "snappy")
.getOrCreate()
Enable the codegen option in Scala:
conf.set("spark.sql.codegen","true")
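spark.sql.codegen is a Spark 1.x setting; on Spark 2.x the analogous switch is whole-stage code generation, which is already on by default. A minimal sketch of toggling it on a running session:
spark.conf.set("spark.sql.codegen.wholeStage", "true")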