Using PySpark in Python

Data input in PySpark

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("test_spark")

sc = SparkContext(conf=conf)

rdd1 = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = sc.parallelize((1, 2, 3, 4, 5))
rdd3 = sc.parallelize("asdcascacasc")
rdd4 = sc.parallelize({1, 2, 3, 4, 5})
rdd5 = sc.parallelize({"好啊": "value1", "key2": "value2"})
rdd6 = sc.textFile("D:/桌面/资料内容/资料/hello.txt")
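
# Expected output (roughly; the order of set and dict elements may vary):
# rdd1, rdd2, rdd4 -> [1, 2, 3, 4, 5]
# rdd3 -> each character of the string as a separate element
# rdd5 -> only the dict's keys are kept
# rdd6 -> one element per line of the text file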

print(rdd1.collect())
print(rdd2.collect())
print(rdd3.collect())
print(rdd4.collect())
print(rdd5.collect())
print(rdd6.collect())

sc.stop()

Using the map method: map(func) takes a function as its argument

from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = "D:/Python/Anacond/python.exe"

conf = SparkConf().setMaster("local[*]").setAppName("test_spark")
sc = SparkContext(conf=conf)

rdd = sc.parallelize([1, 2, 3, 4, 5])


# Multiply each element of the RDD by 10
def func(data):
    return data * 10


# rdd2 = rdd.map(func)
# The same thing with lambda functions (two chained map calls: *10, then +5)
rdd2 = rdd.map(lambda x: x * 10).map(lambda x: x + 5)

# [15, 25, 35, 45, 55]
print(rdd2.collect())

sc.stop()
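
Note: map is a transformation and is evaluated lazily; the multiplication only actually runs once an action such as collect() is called.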

The flatMap(func) method: basically the same as map, but flatMap also flattens nested results

from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = "D:/Python/Anacond/python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark")
sc = SparkContext(conf=conf)

rdd = sc.parallelize(["itheima pjb 666", "sadijad dsakj asdkj", "python nihaoa"])

# map result: [['itheima', 'pjb', '666'], ['sadijad', 'dsakj', 'asdkj'], ['python', 'nihaoa']]
rdd2 = rdd.map(lambda x: x.split(" "))
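# (rdd2 is immediately reassigned below, so only the flatMap result gets printed)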

# flatMap result: ['itheima', 'pjb', '666', 'sadijad', 'dsakj', 'asdkj', 'python', 'nihaoa']
rdd2 = rdd.flatMap(lambda x: x.split(" "))
print(rdd2.collect())

reduceByKey: for key-value (KV) RDDs, it automatically groups the data by key and then aggregates the values within each group according to the logic you provide

from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = "D:/Python/Anacond/python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark")
sc = SparkContext(conf=conf)

rdd = sc.parallelize([('男', 99), ('男', 69), ('女', 49), ('女', 29)])

# Result: [('男', 168), ('女', 78)]
rdd2 = rdd.reduceByKey(lambda a, b: a + b)
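# (the lambda only ever receives values, never keys: 99 + 69 = 168 for '男', 49 + 29 = 78 for '女')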
print(rdd2.collect())

Worked example: word count

from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = "D:/Python/Anacond/python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark")
sc = SparkContext(conf=conf)

rdd = sc.textFile("D:/桌面/资料内容/资料/text.txt")
# Extract all the words
rdd2 = rdd.flatMap(lambda x: x.split(" "))
# Turn each word into a 2-tuple: the word as key, 1 as value
word_rdd = rdd2.map(lambda word: (word, 1))
# Group by key and sum the counts
result = word_rdd.reduceByKey(lambda a, b: a + b)

print(result.collect())

Using filter: pass in a function that returns True or False; elements for which it returns True are kept, the rest are filtered out

from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = "D:/Python/Anacond/python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark")
sc = SparkContext(conf=conf)

rdd = sc.parallelize([1, 2, 3, 4, 5])

# Keep the elements that satisfy the condition; drop the rest
rdd2 = rdd.filter(lambda num: num % 2 == 0)

# Result: [2, 4]
print(rdd2.collect())

The distinct method: removes duplicate elements and takes no arguments

from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = "D:/Python/Anacond/python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark")
sc = SparkContext(conf=conf)

rdd = sc.parallelize([1, 1, 3, 4, 5, 6, 7, 7, 8, 9, 9])

# Remove duplicates
rdd2 = rdd.distinct()

# Result: [8, 1, 9, 3, 4, 5, 6, 7] (order is not guaranteed)
print(rdd2.collect())

sortBy: sorts the data

from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = "D:/Python/Anacond/python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark")
sc = SparkContext(conf=conf)

rdd = sc.textFile("D:/桌面/资料内容/资料/text.txt")

# Extract all the words
rdd2 = rdd.flatMap(lambda x: x.split(" "))
# Turn each word into a 2-tuple: the word as key, 1 as value
word_rdd = rdd2.map(lambda word: (word, 1))
# Group by key and sum the counts
result = word_rdd.reduceByKey(lambda a, b: a + b)
# [('pjb', 3), ('xxx', 5), ('', 1), ('ddd', 2), ('uuuuuuu', 2)]
print(result.collect())

# sortBy(func, ascending=True/False, numPartitions=N): ascending=False sorts from largest to smallest, True from smallest to largest
final = result.sortBy(lambda x: x[1], ascending=False, numPartitions=1)
# [('xxx', 5), ('pjb', 3), ('ddd', 2), ('uuuuuuu', 2), ('', 1)]
print(final.collect())

Worked example 2

from pyspark import SparkConf, SparkContext
import os
import json

os.environ['PYSPARK_PYTHON'] = "D:/Python/Anacond/python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark")
sc = SparkContext(conf=conf)

file_rdd = sc.textFile("xxxx")
# Extract the JSON strings (one record per "|"-separated segment)
json_str_rdd = file_rdd.flatMap(lambda x: x.split("|"))
# Convert each JSON string into a dict
dict_rdd = json_str_rdd.map(lambda x: json.loads(x))
# Extract (city, sales amount) pairs
city_with_money_rdd = dict_rdd.map(lambda x: (x['areaName'], int(x['money'])))
# Aggregate the sales by city name
city_result_rdd = city_with_money_rdd.reduceByKey(lambda a, b: a + b)
# Sort the aggregated results by sales amount
city_sorted_rdd = city_result_rdd.sortBy(lambda x: x[1], ascending=False, numPartitions=1)

# Extract all distinct product categories
category_rdd = dict_rdd.map(lambda x: x['category']).distinct()
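
# To inspect the results (a minimal sketch, assuming the input file exists
# and the collected data fits in driver memory):
print(city_sorted_rdd.collect())
print(category_rdd.collect())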


Output as Python objects: collect, reduce, take, count

from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = "D:/Python/Anacond/python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark")
sc = SparkContext(conf=conf)

rdd = sc.parallelize([1, 2, 3, 4, 5])

# collect: returns the contents of the RDD as a Python list
rdd_list: list = rdd.collect()
print(rdd_list)  # [1, 2, 3, 4, 5]
print(type(rdd_list))

# reduce: aggregates the RDD's elements pairwise
num = rdd.reduce(lambda a, b: a + b)
print(num)  # 15

# take: returns the first N elements of the RDD as a list
take_list = rdd.take(3)
print(take_list)  # [1, 2, 3]

# count: returns the number of elements in the RDD
num = rdd.count()
print(num)  # 5

Data output: writing to files

from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = "D:/Python/Anacond/python.exe"
os.environ['HADOOP_HOME'] = "D:/桌面/第15章资料/资料/hadoop-3.0.0.tar/hadoop-3.0.0/hadoop-3.0.0"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark")
conf.set("spark.default.parallelism", "1")  # Option 1: set the global parallelism to 1 so only one output file is written; otherwise one file is written per CPU core
sc = SparkContext(conf=conf)

# Option 2: pass 1 as the second argument, or write it as numSlices=1
rdd1 = sc.parallelize([1, 2, 3, 4, 5], numSlices=1)

rdd2 = sc.parallelize([("Hello", 3), ("Spark", 5), ("Hi", 7)], 1)

rdd3 = sc.parallelize([[1, 3, 5], [56, 7, 9], [11, 13, 11]], 1)

# Write each RDD to its own output directory; the target directory must not already exist
# (the output1/output2/output3 names below are just examples)
rdd1.saveAsTextFile("D:/桌面/资料内容/output1")
rdd2.saveAsTextFile("D:/桌面/资料内容/output2")
rdd3.saveAsTextFile("D:/桌面/资料内容/output3")
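
Note that saveAsTextFile writes a directory containing one part file per partition (plus a _SUCCESS marker) rather than a single text file, which is why the parallelism settings above control how many output files are produced.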
