CREATE USER 'spark'@'%' IDENTIFIED BY '123456';
GRANT ALL PRIVILEGES ON hive.* TO 'spark'@'%';
FLUSH PRIVILEGES;
Specify the location of the MySQL connector driver when starting spark-shell:
bin/spark-shell --master spark://mini1:7077 --executor-memory 1g --total-executor-cores 2 --driver-class-path /home/hadoop/spark/lib/mysql-connector-java-5.1.35-bin.jar
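If you launch PySpark instead of spark-shell, the driver jar can be passed the same way. This is a sketch only, assuming the same jar path and cluster address as above:
bin/pyspark --master spark://mini1:7077 --driver-class-path /home/hadoop/spark/lib/mysql-connector-java-5.1.35-bin.jar --jars /home/hadoop/spark/lib/mysql-connector-java-5.1.35-bin.jar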
Spark SQL is the Spark module for processing structured data. It provides a programming abstraction called DataFrame and also acts as a distributed SQL query engine.
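As a minimal sketch of that abstraction (the column names and sample rows here are made up for illustration), a DataFrame can be built directly from Row objects and queried with SQL:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.appName("MiniExample").master("local").getOrCreate()
# Build a DataFrame from an in-memory list of Rows (illustrative data)
df = spark.createDataFrame([Row(id="1", name="tom", age=20), Row(id="2", name="jerry", age=25)])
df.createOrReplaceTempView("demo")
spark.sql("SELECT name, age FROM demo WHERE age > 21").show()
spark.stop()

The fuller example below builds the same kind of DataFrame from a text file and an explicit StructType schema.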
from pyspark import Row
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, StructType
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PythonWordCount")\
        .master("local")\
        .getOrCreate()
    sc = spark.sparkContext
    line = sc.textFile("D:\\code\\hadoop\\data\\spark\\day4\\person.txt").map(lambda x: x.split(' '))
    # personRdd = line.map(lambda p: Row(id=p[0], name=p[1], age=int(p[2])))
    # personRdd_tmp = spark.createDataFrame(personRdd)
    # personRdd_tmp.show()

    # Read the data and build the schema explicitly
    schemaString = "id name age"
    fields = list(map(lambda fieldName: StructField(fieldName, StringType(), nullable=True), schemaString.split(" ")))
    schema = StructType(fields)
    rowRDD = line.map(lambda attributes: Row(attributes[0], attributes[1], attributes[2]))
    peopleDF = spark.createDataFrame(rowRDD, schema)
    peopleDF.createOrReplaceTempView("people")
    results = spark.sql("SELECT * FROM people")
    results.rdd.map(lambda attributes: "name: " + attributes[1] + ", age: " + attributes[2]).foreach(print)

    # SQL-style syntax
    # personRdd_tmp.registerTempTable("person")
    # spark.sql("select * from person where age >= 20 order by age desc limit 2").show()

    # DSL (method-call) style syntax
    # personRdd_tmp.select("name").show()
    # personRdd_tmp.select(personRdd_tmp['name'], personRdd_tmp['age'] + 1).show()
    # personRdd_tmp.filter(personRdd_tmp['age'] > 21).show()
    # personRdd_tmp.groupBy("age").count().show()
    # personRdd_tmp.createOrReplaceTempView("people")
    # sqlDF = spark.sql("SELECT * FROM people")
    # sqlDF.show()
    # personRdd_tmp.createGlobalTempView("people")
    # spark.sql("SELECT * FROM global_temp.people").show()
    #
    # spark.newSession().sql("SELECT * FROM global_temp.people").show()

    # Save in a specified format
    # people = line.map(lambda p: (p[0], p[1], p[2].strip()))
    # schemaString = "id name age"
    #
    # fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    # # Specify each field's schema directly via StructType
    # schema = StructType(fields)
    # schemaPeople = spark.createDataFrame(people, schema)
    # schemaPeople.createOrReplaceTempView("people")
    # results = spark.sql("SELECT * FROM people")
    # results.write.json("D:\\code\\hadoop\\data\\spark\\day4\\personout.txt")
    # results.write.save("D:\\code\\hadoop\\data\\spark\\day4\\personout1")
    # results.show()
The result is as follows:
Reading and writing data from MySQL
# JDBC read example (older SQLContext API)
# sqlContext = SQLContext(sc)
# df = sqlContext.read.format("jdbc").options(url="jdbc:mysql://localhost:3306/hellospark",
#                                              driver="com.mysql.jdbc.Driver", dbtable="(select * from actor) tmp",
#                                              user="root", password="123456").load()

# JDBC write example
# schemaPeople.write \
#     .format("jdbc") \
#     .option("url", "jdbc:mysql://localhost:3306/hellospark") \
#     .option("dbtable", "person") \
#     .option("user", "root") \
#     .option("password", "123456") \
#     .mode("append") \
#     .save()
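For reference, the same read and write can also be done through the SparkSession API. This is a minimal sketch: the URL, credentials, and the actor table are taken from the commented snippet above, while the target table actor_copy is hypothetical, so adjust them to your environment:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("JdbcExample").master("local").getOrCreate()

# Read a MySQL table through JDBC (connection details assumed, adjust as needed)
df = spark.read.format("jdbc") \
    .option("url", "jdbc:mysql://localhost:3306/hellospark") \
    .option("driver", "com.mysql.jdbc.Driver") \
    .option("dbtable", "actor") \
    .option("user", "root") \
    .option("password", "123456") \
    .load()
df.show()

# Append the rows to another table (actor_copy is a placeholder name)
df.write.format("jdbc") \
    .option("url", "jdbc:mysql://localhost:3306/hellospark") \
    .option("dbtable", "actor_copy") \
    .option("user", "root") \
    .option("password", "123456") \
    .mode("append") \
    .save()

spark.stop()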
For Hive integration, you need to specify warehouse_location and put the configured hive-site.xml into the $SPARK_HOME/conf directory.
from os.path import abspath
from pyspark.sql import SparkSession
# from pyspark.sql.types import StructField, StringType, StructType
if __name__ == "__main__":
    # spark = SparkSession\
    #     .builder\
    #     .appName("PythonWordCount")\
    #     .master("spark://mini1:7077")\
    #     .getOrCreate()
    # sc = spark.sparkContext
    warehouse_location = abspath('spark-warehouse')
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL Hive integration example") \
        .config("spark.sql.warehouse.dir", warehouse_location) \
        .enableHiveSupport() \
        .getOrCreate()
    # spark.sql("CREATE TABLE IF NOT EXISTS person(id STRING, name STRING, age STRING) row format delimited fields terminated by ' '")
    # spark.sql("LOAD DATA INPATH 'hdfs://mini1:9000/person/data/person.txt' INTO TABLE person")
    spark.sql("SELECT * FROM person").show()
    # line = sc.textFile("hdfs://mini1:9000/person/data/person.txt").map(lambda x: x.split(' '))
    # people = line.map(lambda p: (p[0], p[1], p[2].strip()))
    # schemaString = "id name age"
    #
    # fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    # # Specify each field's schema directly via StructType
    # schema = StructType(fields)
    # schemaPeople = spark.createDataFrame(people, schema)
    # schemaPeople.createOrReplaceTempView("people")
    #
    # schemaPeople.write \
    #     .format("jdbc") \
    #     .option("url", "jdbc:mysql://192.168.62.132:3306/hellospark") \
    #     .option("dbtable", "person") \
    #     .option("user", "root") \
    #     .option("password", "123456you") \
    #     .mode("append") \
    #     .save()
    spark.stop()
    # sc.stop()
Alternatively, this can be implemented with org.apache.spark.sql.hive.HiveContext.
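A minimal sketch of the HiveContext approach (the legacy, pre-2.0 entry point that SparkSession.enableHiveSupport() now supersedes; the person table name follows the example above):

from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext("local", "HiveContextExample")
# HiveContext reads hive-site.xml from the classpath, same as the SparkSession version above
hive_ctx = HiveContext(sc)
hive_ctx.sql("SELECT * FROM person").show()
sc.stop()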