用spark分析北京积分落户数据,整理北京积分落户名单数据成csv格式

读取json文件格式数据,整理导出成csv格式

import json,csv

# Load the source data and export it as CSV
def loadData(json_path="jifenluohu.json", csv_path="jifenluohu.csv"):
    """Convert the ``rows`` list of a JSON file into a CSV file.

    The JSON document must be an object with a ``rows`` key holding a list
    of dicts. Only the columns named in ``fieldnames`` are exported; any
    other keys present in a row are ignored.

    Args:
        json_path: Path of the JSON file to read.
        csv_path: Path of the CSV file to create (overwritten if present).

    Returns:
        None. A completion message is printed when the CSV has been written.

    Raises:
        KeyError: If ``rows`` is missing or a row lacks an exported column.
    """
    fieldnames = ["pxid", "id", "idCard", "name", "score", "unit", "ranking"]  # CSV column names

    # The data contains Chinese text, so the encoding is stated explicitly
    # instead of relying on the platform default.
    with open(json_path, "r", encoding="utf-8") as f:
        rows = json.load(f)["rows"]

    # newline="" lets the csv module control line endings itself; without it
    # every row is followed by an extra blank line on Windows.
    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()  # header row
        for row in rows:
            writer.writerow({key: row[key] for key in fieldnames})
    print("写csv完成")
 
# Run the export; loadData() has no return statement, so t is None.
t = loadData()

# Prints "None" after the completion message printed inside loadData().
print(t)

后面又增加了生日、年龄、生肖、星座、省份、城市等属性,为后续进一步分析做准备。

import json,csv
from datetime import datetime

# Map a birth year to its Chinese zodiac animal
def chinese_zodiac(year):
    """Return the single-character zodiac animal for ``year``.

    The twelve animals are stored in one string ordered so that position 0
    (monkey) corresponds to years that are exact multiples of 12.
    """
    animals = '猴鸡狗猪鼠牛虎兔龙蛇马羊'
    position = year % 12
    return animals[position]
    
# Map a birth month and day to a Western zodiac sign
def get_constellation(month, date):
    """Return the zodiac sign name for the given month and day of month.

    ``cutoffs[month - 1]`` is the first day in that month that belongs to
    the next sign; earlier days belong to the sign that began in the previous
    month. ``signs`` repeats its first entry at the end so that late December
    resolves without wrapping the index.
    """
    cutoffs = (21, 20, 21, 21, 22, 22, 23, 24, 24, 24, 23, 22)
    signs = ("摩羯", "水瓶", "双鱼", "白羊", "金牛", "双子", "巨蟹", "狮子", "处女", "天秤", "天蝎", "射手", "摩羯")
    # Step forward one sign once the day reaches this month's cutoff.
    step = 0 if date < cutoffs[month - 1] else 1
    return signs[month - 1 + step]

    
# Build the region-code -> region-name lookup table
def citydict(path="city.csv"):
    """Load a ``code,name`` CSV file into a dict.

    The first column of each row becomes the key and the second column the
    value; both are stripped of surrounding whitespace. Rows with fewer than
    two columns (for example blank lines) or with an empty code (for example
    a bare ``,``) are skipped instead of raising or polluting the table.

    Args:
        path: Path of the lookup CSV file.

    Returns:
        dict mapping code strings to name strings. When a code appears more
        than once, the last occurrence wins.
    """
    citys = {}
    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM if present;
    # newline="" lets csv.reader handle both "\n" and "\r\n" terminators.
    with open(path, encoding="utf-8-sig", newline="") as file:
        for record in csv.reader(file):
            if len(record) < 2:
                continue
            code = record[0].strip()
            if not code:
                continue
            citys[code] = record[1].strip()
    return citys


# Derive the extra per-person columns from an ID card number
def _id_card_attributes(id_card, citys, reference_year):
    """Return the derived columns for one ID card number as a dict.

    The number is sliced by position: characters 1-2 are used as the province
    code, 1-6 as the city code, 7-10 as the birth year, 11-12 as the birth
    month and 13-14 as the birth day.
    NOTE(review): this matches the 18-digit resident ID layout; other formats
    would be sliced incorrectly - confirm the input only has 18-digit IDs.
    """
    province = id_card[0:2]
    city = id_card[0:6]
    year = id_card[6:10]
    month = id_card[10:12]
    day = id_card[12:14]
    return {
        'province': province,
        'city': city,
        # Codes missing from the lookup table yield None (an empty CSV cell).
        'provincename': citys.get(province),
        'cityname': citys.get(city),
        'birthday': year + '-' + month + '-' + day,
        # Year difference only; month and day are not taken into account.
        'age': reference_year - int(year),
        'zoo': chinese_zodiac(int(year)),
        'star': get_constellation(int(month), int(day)),
    }


# Load the source data and export it as CSV with the derived columns
def loadData(json_path="jifenluohu.json", csv_path="jifenluohu.csv", reference_year=2018):
    """Convert the ``rows`` list of a JSON file into an enriched CSV file.

    Besides the original columns, each output row carries the values derived
    from ``idCard``: province and city codes and names, birthday, age,
    Chinese zodiac animal and Western zodiac sign. The ``unit`` value is
    stripped of surrounding whitespace.

    Args:
        json_path: Path of the JSON file to read; it must hold an object with
            a ``rows`` list of dicts.
        csv_path: Path of the CSV file to create (overwritten if present).
        reference_year: Year against which ``age`` is computed.

    Returns:
        None. A completion message is printed when the CSV has been written.

    Raises:
        KeyError: If ``rows`` is missing or a row lacks a base column.
        ValueError: If the date part of an ID card number is not numeric.
    """
    base_fields = ["pxid", "id", "idCard", "name", "score", "unit", "ranking"]
    derived_fields = ["province", "city", "provincename", "cityname", "birthday", "age", "zoo", "star"]
    fieldnames = base_fields + derived_fields  # CSV column names, in output order

    # The data contains Chinese text, so the encoding is stated explicitly
    # instead of relying on the platform default.
    with open(json_path, 'r', encoding="utf-8") as f:
        rows = json.load(f)['rows']

    # Load the lookup table before opening the output, so that a missing
    # lookup file does not leave a freshly truncated CSV behind.
    citys = citydict()

    # newline="" lets the csv module control line endings itself; without it
    # every row is followed by an extra blank line on Windows.
    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()  # header row
        for row in rows:
            newrow = {key: row[key] for key in base_fields}
            newrow["unit"] = row["unit"].strip()
            newrow.update(_id_card_attributes(row["idCard"], citys, reference_year))
            writer.writerow(newrow)
    print("写csv完成")


# Run the export; loadData() has no return statement, so t is None.
t = loadData()
# Alternative left by the author: inspect the lookup table from citydict() instead.
#t = citydict()
# Prints "None" after the completion message printed inside loadData().
print(t)

资料包以及用pyspark分析的完整过程,可从以下地址下载:

https://download.csdn.net/download/huoyongliang/10723220

百度云

https://pan.baidu.com/s/1XyoyO3AgkVwVRRBnGZq2Gg

你可能感兴趣的:(Spark)