# A demo of cleaning data with Spark.

import sys

# Python 2 only: switch the interpreter-wide default codec to UTF-8 so that
# implicit str/unicode conversions of non-ASCII text do not raise
# UnicodeDecodeError. reload(sys) is needed because site.py removes
# sys.setdefaultencoding at startup. On Python 3 the default is already
# UTF-8 and neither `reload` (as a builtin) nor `sys.setdefaultencoding`
# exists, so the hack is skipped there instead of failing with NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf8')

import re
import json

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, StructField, StructType
import copy


# URL of the Spark master the session connects to.
master_url = 'spark://sc-bd-10:7077'

# Every setting is passed through the builder so it is part of the
# configuration the session is created with. spark.driver.maxResultSize is a
# Spark core setting: assigning it with spark.conf.set() on a live session
# does not change the driver's limit, and Spark 3.x rejects the call with an
# AnalysisException ("Cannot modify the value of a Spark config").
spark = (
    SparkSession.builder
    .master(master_url)
    .appName("saic_huangyu")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.sql.broadcastTimeout", "1200")
    .config("spark.sql.crossJoin.enabled", "true")
    .getOrCreate()
)

# Register the local helper module as a dependency for the tasks run on this
# SparkContext, then import the pid hashing function used by the job below.
# The import is kept after addPyFile, as in the original ordering.
spark.sparkContext.addPyFile("md5_eid_pid.py")
from md5_eid_pid import gen_md5_pid


def _string_schema(field_names):
    """Return a StructType whose columns are all nullable strings."""
    columns = []
    for column_name in field_names:
        columns.append(StructField(column_name, StringType(), True))
    return StructType(columns)


def _read_csv(path, schema):
    """Load a comma-delimited CSV from *path* with an explicit schema."""
    return spark.read.load(path, format="csv", schema=schema, delimiter=',')


# Input 1: person/company investment rows, exposed to Spark SQL as a temp view.
person_inv_company_without_pid_list = [
    "eid_merged",
    "share_name",
    "share_type",
    "inv_conum",
    "con_date",
]
person_inv_company_without_pid_list_schema = _string_schema(
    person_inv_company_without_pid_list
)
df_person_inv_company_without_pid = _read_csv(
    "hdfs://sc-bd-10:9000/scdata/huangyu/result/person_inv_company_table_person_without_pid_compliment_without_pid.csv",
    person_inv_company_without_pid_list_schema,
)
df_person_inv_company_without_pid.createOrReplaceTempView(
    "person_inv_company_table_person_without_pid_compliment_without_pid"
)


# Input 2: person/company position rows, exposed to Spark SQL as a temp view.
person_name_without_pid_list = ["eid_merged", "person_name", "is_fr", "position"]
person_name_without_pid_schema = _string_schema(person_name_without_pid_list)
df_person_name_without_pid = _read_csv(
    "hdfs://sc-bd-10:9000/scdata/huangyu/result/person_position_company_table_person_without_pid_compliment_without_pid.csv",
    person_name_without_pid_schema,
)
df_person_name_without_pid.createOrReplaceTempView(
    "person_position_company_table_person_without_pid_compliment_without_pid"
)


# Input 3: the merged-eid table that supplies the `name` column per eid_merged.
merged_eid_table_list = ["eid_merged", "eid_new", "name"]
merged_eid_table_schema = _string_schema(merged_eid_table_list)
df_merged_eid_table = _read_csv(
    "hdfs://sc-bd-10:9000/scdata/huangyu/result/merge_new_old_table.csv",
    merged_eid_table_schema,
)
df_merged_eid_table.createOrReplaceTempView("merged_eid_table")


# Schema of the generated output (no input file is read for it).
pid_eid_table_list = ["eid_merged", "pid", "person_name"]
pid_eid_table_schema = _string_schema(pid_eid_table_list)


def _to_pid_record(record):
    """Map one joined row to an (eid_merged, pid, person_name) tuple.

    The pid is gen_md5_pid applied to the company name concatenated with the
    person name. A plain tuple in the column order of pid_eid_table_schema
    keeps the column mapping positional and explicit instead of relying on
    how Row orders keyword fields.

    The caller must have removed rows whose company or person_name is NULL;
    concatenating None with a string raises TypeError.
    """
    person_name = record["person_name"]
    return (
        record["eid_merged"],
        gen_md5_pid(record["company"] + person_name),
        person_name,
    )


try:
    # Distinct (eid_merged, person_name) pairs from both person tables,
    # joined to the merged-eid table to pick up the company name.
    persons_with_company = spark.sql("""
select t1.eid_merged, t2.name as company, t1.person_name 
from (
    select eid_merged, share_name as person_name
    from person_inv_company_table_person_without_pid_compliment_without_pid
    union 
    select eid_merged, person_name  
    from person_position_company_table_person_without_pid_compliment_without_pid
) t1
left join merged_eid_table t2
on t1.eid_merged=t2.eid_merged
where t1.eid_merged is not null and t2.eid_merged is not null 
""")

    # Both columns are nullable strings and the SQL above only guards
    # eid_merged, so a NULL company or person_name would reach the mapper and
    # abort the whole job with a TypeError. No pid can be derived for such
    # rows, so they are dropped before the map.
    persons_with_company = persons_with_company.where(
        "company is not null and person_name is not null"
    )

    persons_with_company.rdd \
        .map(_to_pid_record) \
        .toDF(pid_eid_table_schema) \
        .write \
        .save("hdfs://sc-bd-10:9000/scdata/huangyu/result/person_pid_gen_new_all_field.csv", format="csv", header=False, delimiter=',', mode="overwrite")
finally:
    # Stop the session even when the job fails.
    spark.stop()

# Related topics: python, large-scale data analysis with Spark