Package installation
pip install pyspark
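To sanity-check the install, a quick version print is enough. A minimal sketch; run it with the same interpreter you installed pyspark into:
# confirm that pyspark is importable and show which version was installed
import pyspark
print(pyspark.__version__)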
SparkContext is the entry point of a PySpark program, so the first step is to build a SparkContext object.
from pyspark import SparkConf, SparkContext
# Build a SparkConf instance
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
# Build the SparkContext
sc = SparkContext(conf=conf)
# Print the Spark version
print(sc.version)
# Shut down the PySpark program
sc.stop()
from pyspark import SparkConf, SparkContext
# Build a SparkConf instance
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
# Build the SparkContext
sc = SparkContext(conf=conf)
# Convert a Python container into an RDD; lists, tuples, sets, dicts, strings, etc. all work
rdd1 = sc.parallelize([1, 2, 3, 4, 5])
# View the data held by the RDD
print(rdd1.collect())
# Read a file and turn it into an RDD
rdd2 = sc.textFile("D:/file.txt")
print(rdd2.collect())
# Shut down the PySpark program
sc.stop()
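For reference, textFile turns each line of the file into one RDD element. A minimal, self-contained sketch; the file path and contents here are made up for illustration:
from pyspark import SparkConf, SparkContext

# write a small sample file first, so the example is self-contained
with open("D:/file.txt", "w", encoding="utf-8") as f:
    f.write("hello spark\nhello python\n")

conf = SparkConf().setMaster("local[*]").setAppName("textfile_demo")
sc = SparkContext(conf=conf)
# each line of the file becomes one element of the RDD
print(sc.textFile("D:/file.txt").collect())  # ['hello spark', 'hello python']
sc.stop()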
from pyspark import SparkConf, SparkContext
import os
# Set the environment variable so PySpark finds the right Python interpreter
os.environ["PYSPARK_PYTHON"] = "D:/python/python-3.10.9/python.exe"
# Build a SparkConf instance
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
# Build the SparkContext
sc = SparkContext(conf=conf)
# Convert a Python container into an RDD; lists, tuples, sets, dicts, strings, etc. all work
rdd = sc.parallelize([1, 2, 3, 4, 5])
# Call the map operator
rdd1 = rdd.map(lambda x: x * 10)
# Print the data after the map operator
print(rdd1.collect())
# Shut down the PySpark program
sc.stop()
Output:
23/03/12 20:35:29 WARN Shell: Did not find winutils.exe: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/03/12 20:35:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[10, 20, 30, 40, 50]
If you get IndexError: tuple index out of range, it may be caused by the Python version. I hit this error with 3.11.1; switching to 3.10.9 has worked fine so far. (Download the matching Python version, update the environment variables, the interpreter configured in PyCharm, and the interpreter path in the code, then run pip install pyspark again.)
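For reference, the interpreter pinning used in the code samples can also cover the driver side. A minimal sketch to place before the SparkContext is created; the path is just the 3.10.9 install from this article, substitute your own:
import os
# point both the worker and the driver at the same Python 3.10.9 interpreter
os.environ["PYSPARK_PYTHON"] = "D:/python/python-3.10.9/python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = "D:/python/python-3.10.9/python.exe"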
The flatMap operator works like map but also un-nests (flattens) the result. For example, list 1:
[[1, 2], [3, 4]]
after un-nesting becomes
[1, 2, 3, 4]
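A minimal sketch of exactly that case, assuming the same conf/sc setup as the examples below; the identity lambda hands each inner list to flatMap, which then unpacks it:
# flatten [[1, 2], [3, 4]] into [1, 2, 3, 4]
rdd = sc.parallelize([[1, 2], [3, 4]])
print(rdd.flatMap(lambda x: x).collect())  # [1, 2, 3, 4]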
from pyspark import SparkConf, SparkContext
import os
# Set the environment variable so PySpark finds the right Python interpreter
os.environ["PYSPARK_PYTHON"] = "D:/python/python-3.10.9/python.exe"
# Build a SparkConf instance
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
# Build the SparkContext
sc = SparkContext(conf=conf)
# Convert a Python container into an RDD; lists, tuples, sets, dicts, strings, etc. all work
rdd = sc.parallelize(["hello world", "python great"])
# Call the flatMap operator
rdd1 = rdd.flatMap(lambda x: x.split(" "))
# Print the data after the flatMap operator
print(rdd1.collect())
# Shut down the PySpark program
sc.stop()
Output:
['hello', 'world', 'python', 'great']
from pyspark import SparkConf, SparkContext
import os
# Set the environment variable so PySpark finds the right Python interpreter
os.environ["PYSPARK_PYTHON"] = "D:/python/python-3.10.9/python.exe"
# Build a SparkConf instance
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
# Build the SparkContext
sc = SparkContext(conf=conf)
# Convert a Python container into an RDD; lists, tuples, sets, dicts, strings, etc. all work
rdd = sc.parallelize([('男', 99), ('男', 88), ('女', 100), ('女', 99)])
# Call the reduceByKey operator: aggregate the values that share a key
rdd1 = rdd.reduceByKey(lambda x, y: x + y)
# Print the data after the reduceByKey operator
print(rdd1.collect())
# Shut down the PySpark program
sc.stop()
Output:
[('男', 187), ('女', 199)]
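Putting flatMap, map, and reduceByKey together gives the classic word count. A hedged sketch with made-up sample sentences, assuming the same conf/sc setup as above:
# split each line into words, pair every word with 1, then sum the 1s per word
rdd = sc.parallelize(["hello spark", "hello python", "spark is fast"])
counts = rdd.flatMap(lambda line: line.split(" ")) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda a, b: a + b)
print(counts.collect())  # e.g. [('hello', 2), ('spark', 2), ('python', 1), ('is', 1), ('fast', 1)]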
from pyspark import SparkConf, SparkContext
import os
# Set the environment variable so PySpark finds the right Python interpreter
os.environ["PYSPARK_PYTHON"] = "D:/python/python-3.10.9/python.exe"
# Build a SparkConf instance
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
# Build the SparkContext
sc = SparkContext(conf=conf)
# Convert a Python container into an RDD; lists, tuples, sets, dicts, strings, etc. all work
rdd = sc.parallelize([1, 2, 3, 4, 5])
# Call the filter operator: keep only the elements for which the lambda returns True
rdd1 = rdd.filter(lambda x: x > 2)
# Print the data after the filter operator
print(rdd1.collect())
# Shut down the PySpark program
sc.stop()
Output:
[3, 4, 5]
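The predicate can be any function that returns True or False, so filter also works on (key, value) tuples. A small sketch reusing the score data from the reduceByKey example, same setup assumed:
# keep only the records whose score is at least 90
rdd = sc.parallelize([('男', 99), ('男', 88), ('女', 100), ('女', 99)])
print(rdd.filter(lambda x: x[1] >= 90).collect())  # [('男', 99), ('女', 100), ('女', 99)]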
from pyspark import SparkConf, SparkContext
import os
# Set the environment variable so PySpark finds the right Python interpreter
os.environ["PYSPARK_PYTHON"] = "D:/python/python-3.10.9/python.exe"
# Build a SparkConf instance
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
# Build the SparkContext
sc = SparkContext(conf=conf)
# Convert a Python container into an RDD; lists, tuples, sets, dicts, strings, etc. all work
rdd = sc.parallelize([1, 1, 3, 3, 5])
# Call the distinct operator: remove duplicate elements
rdd1 = rdd.distinct()
# Print the data after the distinct operator
print(rdd1.collect())
# Shut down the PySpark program
sc.stop()
Output (note that distinct does not preserve the original element order):
[1, 5, 3]
from pyspark import SparkConf, SparkContext
import os
# Set the environment variable so PySpark finds the right Python interpreter
os.environ["PYSPARK_PYTHON"] = "D:/python/python-3.10.9/python.exe"
# Build a SparkConf instance
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
# Build the SparkContext
sc = SparkContext(conf=conf)
# Convert a Python container into an RDD; lists, tuples, sets, dicts, strings, etc. all work
rdd = sc.parallelize([("男", 89), ("女", 98), ("不明", 50)])
# Call the sortBy operator: sort by the second element of each tuple, descending, in a single partition
rdd1 = rdd.sortBy(lambda x: x[1], ascending=False, numPartitions=1)
# Print the data after the sortBy operator
print(rdd1.collect())
# Shut down the PySpark program
sc.stop()
Output:
[('女', 98), ('男', 89), ('不明', 50)]
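As a small follow-on, sortBy chains naturally after reduceByKey, for example sorting the aggregated scores from the earlier example by total (same setup assumed):
# aggregate scores per key, then sort the totals in descending order
rdd = sc.parallelize([('男', 99), ('男', 88), ('女', 100), ('女', 99)])
result = rdd.reduceByKey(lambda a, b: a + b) \
            .sortBy(lambda x: x[1], ascending=False, numPartitions=1)
print(result.collect())  # [('女', 199), ('男', 187)]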
That's it for today. A little bit of progress every day!