PySpark: reading file paths and file contents

Code:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @author  : 何小义

import sys
reload(sys)                      # Python 2 only: required before calling setdefaultencoding
sys.setdefaultencoding('utf8')   # Python 2 only: force UTF-8 as the default encoding
import os
import json

# Local Spark (note: change this to your own Spark installation path)
os.environ['SPARK_HOME'] = "/usr/spark-2.0.1"
sys.path.append("/usr/spark-2.0.1/python")
sys.path.append("/usr/spark-2.0.1/python/bin")

try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.sql import SparkSession
    from pyspark.sql import SQLContext
    from pyspark.sql import DataFrame
    from pyspark.sql import Row
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

# === Reading files with Spark =========================================

# Configure Spark (cluster)
# spark = SparkSession.builder.master(master_url).appName("hzy_test_script").getOrCreate()
# Configure Spark (local)
spark = SparkSession.builder.master('local').appName("hzy_test_script").getOrCreate()
sc = spark.sparkContext

# rdd_data = sc.textFile("xxx.t*xt")      # file contents only, one element per line
rdd_data = sc.wholeTextFiles("xxx.t*xt")  # (file path, file contents) pairs
result = rdd_data.take(10)
print(result)

print('Done...')
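If you prefer the DataFrame API, a similar "path + contents" view can be built with pyspark.sql.functions.input_file_name(), which tags each record with the URI of the file it came from. A minimal sketch (sample.txt is a hypothetical placeholder file):

from pyspark.sql.functions import input_file_name

# each row gets the text line in `value` plus the source file URI in `path`
df = spark.read.text("sample.txt").withColumn("path", input_file_name())
df.show(truncate=False)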

Notes:

1. data_rdd = sc.textFile('xxxxxxx.txt')  # reads only the file contents; returns an RDD with one element per line

2. path_data_rdd = sc.wholeTextFiles('xxxxxxx.txt')  # reads not only the file contents but also each file's path, as (path, content) pairs
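For a concrete view of the difference, here is a minimal sketch of the two return shapes (sample.txt is a hypothetical two-line file):

lines_rdd = sc.textFile('sample.txt')
print(lines_rdd.take(2))   # e.g. ['first line', 'second line'] -- one element per line

pairs_rdd = sc.wholeTextFiles('sample.txt')
print(pairs_rdd.take(1))   # e.g. [('file:/abs/path/sample.txt', 'first line\nsecond line\n')] -- one (path, whole-file content) pair per file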
