pyspark读csv

使用 PySpark 读取 CSV 文件中的数据。
该 CSV 文件带有表头(header),共两列,列名分别为 bd 和 tt;其中 tt 列的值是以分号(;)分隔的字符串。

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

def run():
    """Load ``map.csv`` with Spark and fill ``result`` from its rows.

    The CSV is read with a header row and an explicit schema of two
    nullable string columns, ``bd`` and ``tt``. For every row,
    ``result[bd]`` is set to ``tt`` split on ``";"``; a row whose ``tt``
    is null is stored as an empty list instead of raising. ``analysis()``
    is called once all rows have been stored.

    NOTE(review): ``result`` and ``analysis`` are not defined inside this
    function; they are presumably module-level names defined elsewhere --
    confirm before running this as a standalone script.
    """
    spark = SparkSession.builder.appName("read_csv").getOrCreate()

    # Define the schema explicitly: both columns are nullable strings.
    schema = StructType([
        StructField('bd', StringType(), True),
        StructField('tt', StringType(), True),
    ])

    # header=True: the first line of the file holds the column names.
    # A schema is passed explicitly, so inferSchema (automatic schema
    # inference) is not used here.
    df = spark.read.csv(r"map.csv", schema=schema, encoding='utf-8', header=True)
    df = df.select("bd", "tt")

    # collect() materializes every row on the driver.
    for row in df.collect():
        tt = row['tt']
        # The column is nullable, so a missing or empty field can arrive
        # as None; guard before calling str.split.
        result[row['bd']] = tt.split(";") if tt is not None else []
    analysis()


# Script entry point: call run() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    run()

你可能感兴趣的:(python,spark)