To support Python on Spark, the Apache Spark community released a tool called PySpark.
Online installation of PySpark:
pip install -U -i https://pypi.tuna.tsinghua.edu.cn/simple pyspark
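A quick way to confirm the installation succeeded is to import the package and check its version (a minimal sketch; the expected version matches the pyspark 2.4.4 package referenced below):

import pyspark
print(pyspark.__version__)   # should print the installed version, e.g. 2.4.4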
Offline installation of PySpark:
Download wheel from https://pypi.org/project/wheel/#files, then install it from the unpacked directory:
wheel-0.33.6> python setup.py install
Download pyspark from https://pypi.org/project/pyspark/#files, then install it from the unpacked directory:
pyspark-2.4.4> python setup.py install
Install the required software on Windows and configure the following environment variables:
JAVA_HOME C:\Program Files\Java\jdk1.8.0_101
HADOOP_HOME F:\develop\hadoop\hadoop-2.10.0
SCALA_HOME F:\develop\scala\scala-2.12.8
SPARK_HOME F:\develop\spark\spark-2.4.4-bin-hadoop2.7
Add the corresponding bin directories to the PATH environment variable.
Copy the Hive configuration file hive-site.xml into the $SPARK_HOME/conf directory.
Copy the MySQL driver jar into the $SPARK_HOME/jars directory.
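Before moving on to the full example, the setup can be verified with a short local smoke test. This is a minimal sketch assuming hive-site.xml and the MySQL driver have been copied as described above and the paths match the environment variables listed earlier:

# -*- coding:utf-8 -*-
import os
from pyspark.sql import SparkSession

# Paths assume the environment variables configured above
os.environ['HADOOP_HOME'] = r'F:\develop\hadoop\hadoop-2.10.0'
os.environ['SPARK_HOME'] = r'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'

# A local Hive-enabled session; this fails fast if hive-site.xml or the
# MySQL driver jar is missing from SPARK_HOME
spark = SparkSession.builder \
    .appName("SmokeTest") \
    .master("local[*]") \
    .enableHiveSupport() \
    .getOrCreate()

spark.sql("show databases").show()   # should list at least the 'default' database
spark.stop()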
Example (first file: the spark module defining BaseSparkSession):
# -*- coding:utf-8 -*-
import sys

from pyspark import SparkConf
from pyspark.sql import SparkSession

# Python 2 only: force the default encoding to UTF-8
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)


class BaseSparkSession(object):
    """Base class that builds a SparkSession from class-level settings."""

    SPARK_APP_NAME = None
    SPARK_MASTER_URL = 'yarn'
    SPARK_EXECUTOR_CORES = 2
    SPARK_EXECUTOR_MEMORY = '2g'
    SPARK_EXECUTOR_INSTANCES = 2
    SPARK_YARN_QUEUE = None
    ENABLE_HIVE_SUPPORT = False

    def create_spark_session(self):
        spark_conf = SparkConf()
        spark_conf.setAll(
            (
                ("spark.app.name", self.SPARK_APP_NAME),
                ("spark.master", self.SPARK_MASTER_URL),
                ("spark.executor.cores", self.SPARK_EXECUTOR_CORES),
                ("spark.executor.memory", self.SPARK_EXECUTOR_MEMORY),
                ("spark.executor.instances", self.SPARK_EXECUTOR_INSTANCES),
                ("spark.yarn.queue", self.SPARK_YARN_QUEUE)
            )
        )
        if self.ENABLE_HIVE_SUPPORT:
            return SparkSession.builder.config(conf=spark_conf).enableHiveSupport().getOrCreate()
        else:
            return SparkSession.builder.config(conf=spark_conf).getOrCreate()
Second file (the application code, importing BaseSparkSession from the spark module above):
# -*- coding:utf-8 -*-
import os
import sys

from spark import BaseSparkSession
from pyspark.sql.functions import col

# Python 2 only: force the default encoding to UTF-8
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Point PySpark at the local Python interpreter and the Hadoop/Spark installations
os.environ['PYSPARK_PYTHON'] = r'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = r'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = r'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = r'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'


class SparkYarnHiveOp(BaseSparkSession):
    """Session on YARN with Hive support."""
    SPARK_APP_NAME = "SparkYarnHiveOp"
    SPARK_YARN_QUEUE = "queue3"
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):
        self.spark_session = self.create_spark_session()


class SparkLocalOp(BaseSparkSession):
    """Local session without Hive support."""
    SPARK_APP_NAME = "SparkLocalOp"
    SPARK_MASTER_URL = "local[*]"
    ENABLE_HIVE_SUPPORT = False

    def __init__(self):
        self.spark_session = self.create_spark_session()


class SparkLocalHiveOp(BaseSparkSession):
    """Local session with Hive support."""
    SPARK_APP_NAME = "SparkLocalHiveOp"
    SPARK_MASTER_URL = "local[*]"
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):
        self.spark_session = self.create_spark_session()


if __name__ == '__main__':
    spark_local_op = SparkLocalOp()
    # range() produces a column named "id"; rename it to "count" for the aggregation below
    data = spark_local_op.spark_session.range(0, 100).select(col("id").cast("double").alias("count"))
    data.show()
    data.agg({'count': 'sum'}).show()   # sum of 0..99 = 4950.0
    # getOrCreate() reuses an existing session, so stop it before switching
    # to a Hive-enabled (and later a YARN) session
    spark_local_op.spark_session.stop()

    spark_local_hive_op = SparkLocalHiveOp()
    spark_local_hive_op.spark_session.sql('use default')
    spark_local_hive_op.spark_session.sql('select * from user').show()
    spark_local_hive_op.spark_session.stop()

    spark_yarn_hive_op = SparkYarnHiveOp()
    spark_yarn_hive_op.spark_session.sql('use default')
    spark_yarn_hive_op.spark_session.sql('select * from user').show()
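With a Hive-enabled session, a DataFrame can also be written back to Hive and queried with SQL. A minimal sketch using the spark_yarn_hive_op session from above (the table name demo_numbers is hypothetical and not part of the original example):

# 'demo_numbers' is a hypothetical table name, used only for illustration
session = spark_yarn_hive_op.spark_session
df = session.range(0, 10).selectExpr("id", "id * id as squared")
df.write.mode("overwrite").saveAsTable("default.demo_numbers")   # managed Hive table
session.sql("select * from default.demo_numbers order by id").show()
session.stop()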
Troubleshooting notes:
SparkException: When running with master 'yarn' either HADOOP_CONF_DIR or YARN_CONF_DIR must be set in the environment.
Fix: set HADOOP_CONF_DIR (the directory containing the cluster configuration files such as core-site.xml and yarn-site.xml) before creating the session, for example:
os.environ['PYSPARK_PYTHON'] = r'F:\develop\python\Python27\python.exe'
os.environ['HADOOP_HOME'] = r'F:\develop\hadoop\hadoop-2.10.0'
os.environ['HADOOP_CONF_DIR'] = r'F:\develop\hadoop\hadoop-2.10.0-conf'
os.environ['SPARK_HOME'] = r'F:\develop\spark\spark-2.4.4-bin-hadoop2.7'
ClassNotFoundException: com.sun.jersey.api.client.config.ClientConfig
Fix: copy jersey-core-1.9.jar and jersey-client-1.9.jar from
F:\develop\hadoop\hadoop-2.10.0\share\hadoop\yarn\lib into
F:\develop\spark\spark-2.4.4-bin-hadoop2.7\jars, and delete the old jersey jars from that directory.