Spark Pitfalls -- A Must-Read for Spark Newcomers and Python Spark Users (This Cost Me Nearly Three Weeks)

from pyspark import SparkContext, SparkConf
import os
from tqdm import tqdm

from utils2 import convert_date_2_chenyings_format

os.environ['HADOOP_HOME'] = r'D:\software\spark\winutils\hadoop-common-2.2.0-bin'  # winutils.exe -- the classic Hadoop-on-Windows pitfall; use a raw string so the backslashes are not treated as escapes
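# Optional guard (a sketch; assumes winutils.exe sits under HADOOP_HOME\bin, the
# layout Spark expects on Windows): fail fast here instead of hitting Spark's
# cryptic error later.
assert os.path.exists(os.path.join(os.environ['HADOOP_HOME'], 'bin', 'winutils.exe'))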

os.environ["PYSPARK_PYTHON"] = "/Users/user/Python_Source/venv/python3.7"  # 集群上期望运行的Python的版本

conf = SparkConf().setAppName('AppName').setMaster('spark://0.0.0.0:7077')  # replace 0.0.0.0 with the master's LAN IP
sc = SparkContext(conf=conf)
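
# A minimal sanity check: each executor reports its interpreter version, which
# should match what PYSPARK_PYTHON points at -- a driver/executor Python version
# mismatch is one of the most common pitfalls.
import sys
print(sc.parallelize([0]).map(lambda _: sys.version).collect())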

# Ship the local .py files to the cluster so executors can import them
for f in tqdm(os.listdir()):
    if f.endswith('.py'):  # endswith() avoids false matches such as .pyc files
        sc.addPyFile(f)
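
# Note: sc.addPyFile() also accepts .zip and .egg archives, which is handier for
# shipping a whole package at once. A hedged example ('deps.zip' is a hypothetical name):
# sc.addPyFile('deps.zip')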

# A quick sanity check
data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)
print(distData.collect())  # [1, 2, 3, 4, 5]

data_test = distData.map(lambda k: k + 1)
print(data_test.collect())  # [2, 3, 4, 5, 6]

data_test = distData.map(lambda k: k)  # identity map; swap in a function from your own .py file (e.g. utils2) here
print(data_test.collect())  # [1, 2, 3, 4, 5]
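
# Plain functions defined in this driver script are pickled and shipped with each
# task automatically; addPyFile() is only needed for separate module files such as
# utils2.py. A minimal sketch:
def add_one(x):
    return x + 1

print(distData.map(add_one).collect())  # [2, 3, 4, 5, 6]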

sc.stop()
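
In a real job it is worth guarding the context with try/finally so the cluster resources are released even when a stage fails. A minimal sketch of the same flow:

conf = SparkConf().setAppName('AppName').setMaster('spark://0.0.0.0:7077')
sc = SparkContext(conf=conf)
try:
    print(sc.parallelize(range(5)).map(lambda k: k + 1).collect())
finally:
    sc.stop()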
