There are two main ways to construct an RDD object:
1- Build an RDD from parallelized collections: parallelize a local collection (for testing)
2- Build an RDD from external data: load external files (for testing/development)
Building an RDD from parallelized collections
from pyspark import SparkContext, SparkConf
import os

# Pin the remote interpreter paths so driver and executors use the same environment
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

if __name__ == '__main__':
    print("How to build an RDD, way 1: parallelize a local collection")

    # 1. Create the SparkContext core object
    conf = SparkConf().setAppName("create_rdd_01").setMaster("local[2]")
    sc = SparkContext(conf=conf)

    # 2. Read the dataset: a local collection, split into 3 partitions
    rdd_init = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3)

    # 3. Print the results
    print(rdd_init.collect())
    print(rdd_init.getNumPartitions())  # how many partitions this RDD has
    print(rdd_init.glom().collect())    # the data held in each partition

    # 4. Release resources
    sc.stop()
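If the second argument (numSlices) of parallelize is omitted, the partition count falls back to sc.defaultParallelism, which in local[N] mode is normally N. A minimal sketch to observe this, assuming the same environment setup as above (the app name create_rdd_default_parts is only an illustrative placeholder):

from pyspark import SparkContext, SparkConf
import os

os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

if __name__ == '__main__':
    # local[2] -> defaultParallelism is normally 2
    conf = SparkConf().setAppName("create_rdd_default_parts").setMaster("local[2]")
    sc = SparkContext(conf=conf)

    rdd_default = sc.parallelize([1, 2, 3, 4, 5, 6])  # no numSlices given
    print(sc.defaultParallelism)             # expected: 2 in local[2] mode
    print(rdd_default.getNumPartitions())    # follows defaultParallelism
    print(rdd_default.glom().collect())      # e.g. [[1, 2, 3], [4, 5, 6]]

    sc.stop()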
Building an RDD from external data
from pyspark import SparkContext, SparkConf
import os

# Pin the remote interpreter paths so driver and executors use the same environment
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

if __name__ == '__main__':
    print("How to build an RDD, way 2: read an external dataset")

    # 1. Create the SparkContext core object
    conf = SparkConf().setAppName("create_rdd_02").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    # 2. Read the dataset: all text files under the directory
    rdd_init = sc.textFile("file:///export/data/workspace/ky06_pyspark/_02_SparkCore/data/")

    # 3. Print the results
    print(rdd_init.collect())
    print(rdd_init.getNumPartitions())
    print(rdd_init.glom().collect())
"""
[
[
'hadoop hive hive hadoop sqoop',
'sqoop kafka hadoop sqoop hive hive',
'hadoop hadoop hive sqoop kafka kafka'
],
[
'kafka hue kafka hbase hue hadoop hadoop hive',
'sqoop sqoop kafka hue hue kafka'
]
]
[
['hadoop hive hive hadoop sqoop', 'sqoop kafka hadoop sqoop hive hive'],
['hadoop hadoop hive sqoop kafka kafka'],
['kafka hue kafka hbase hue hadoop hadoop hive'],
[],
['sqoop sqoop kafka hue hue kafka']]
"""
    # 4. Release resources
    sc.stop()
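textFile also accepts a minPartitions argument, which sets a lower bound on the number of input splits, and wholeTextFiles reads each small file as a single (file path, file content) record instead of line by line. A minimal sketch, assuming the same data directory and environment setup as above (the app name create_rdd_03 is only a placeholder):

from pyspark import SparkContext, SparkConf
import os

os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

if __name__ == '__main__':
    conf = SparkConf().setAppName("create_rdd_03").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    path = "file:///export/data/workspace/ky06_pyspark/_02_SparkCore/data/"

    # minPartitions is only a lower bound; the actual split count can be higher
    rdd_min = sc.textFile(path, minPartitions=6)
    print(rdd_min.getNumPartitions())

    # each element is a (file path, whole file content) pair
    rdd_whole = sc.wholeTextFiles(path)
    print(rdd_whole.map(lambda kv: kv[0]).collect())  # just the file paths

    sc.stop()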