Python Beginners(9) -- Clean Data

Read several csv into a dictionary

import pandas as pd
data_files = [
    "ap_2010.csv",
    "class_size.csv",
    "demographics.csv",
    "graduation.csv",
    "hs_directory.csv",
    "sat_results.csv"
]
data = {}
for item in data_files:
    path = 'schools/'+item
    keyname = item.split(".")[0]  # get the filename before '.'
    data[keyname] = pd.read_csv(path)

Good resource for encoding
reference
Read files with other encoding

all_survey = pd.read_csv("schools/survey_all.txt", delimiter = "\t", encoding = "windows-1252")
d75_survey = pd.read_csv("schools/survey_d75.txt", delimiter = "\t", encoding = "windows-1252")
survey = pd.concat([all_survey, d75_survey], axis = 0)  # new rows = row_x + row_y
print (survey.head(5))

Change column names and select a few columns from original

survey["DBN"] = survey["dbn"]
columns_to_keep = ["DBN", "rr_s", "rr_t", "rr_p", "N_s", "N_t", "N_p", "saf_p_11", "com_p_11", "eng_p_11", "aca_p_11", "saf_t_11", "com_t_11", "eng_t_11", "aca_t_11", "saf_s_11", "com_s_11", "eng_s_11", "aca_s_11", "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11"]
survey = survey.loc[:, columns_to_keep]  # select a few columns
data["survey"] = survey

**apply function & lambda **
Create a new column called padded_csd, to turn original csd like '1' to '01', '3' to '03' while keep it as it is when len(csd) > 1.

data["class_size"]['padded_csd'] = data["class_size"]['CSD'].apply(lambda s: str(s).zfill(2)) #could use lambda here

data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]

Another way to do lambda -- apply function

def padding(num):
    a = str(num)
    if len(a) == 1:
        return a.zfill(2)
    else:
        return a
data["class_size"]['padded_csd'] = data["class_size"]['CSD'].apply(padding)

pd.to_numeric()

df["new_score"] = pd.to_numeric(df["score"], errors = "coerce")
df.loc[5, ["new_score"]] # print the first 5 rows

Group by

import numpy
class_size = class_size.groupby("DBN").agg(numpy.mean)
class_size.reset_index(inplace=True)
data["class_size"] = class_size
print(data["class_size"].head())

Select rows based on column values

temp = data["graduation"]
temp = temp.loc[(temp["Cohort"]== '2006') & (temp["Demographic"] == "Total Cohort")]
data["graduation"] = temp
print (data["graduation"].head(5))

merge

combined = combined.merge(data["class_size"], on = "DBN", how = "inner")

fillna

means = combined.mean()
combined = combined.fillna(means)
combined = combined.fillna(value = 0)
print (combined.head(5))

pd.corr()
Quickly to find the correlations between columns
see doc

correlations = combined.corr() # the r value
print (correlations["sat_score"])
# draw a bar chart for corr between sat_scores and other survey fields
combined.corr()["sat_score"][survey_fields].plot.bar()

df.plot

import matplotlib.pyplot as plt
combined.plot.scatter(x="total_enrollment", y="sat_score")

draw school locations with color indicates the values of another column

from mpl_toolkits.basemap import Basemap
m = Basemap(
    projection='merc', 
    llcrnrlat=40.496044, 
    urcrnrlat=40.915256, 
    llcrnrlon=-74.255735, 
    urcrnrlon=-73.700272,
    resolution='i'
)
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)

longitudes = combined["lon"].tolist()
latitudes = combined["lat"].tolist()
# show value of 'ell_percent' column
m.scatter(longitudes, latitudes, s=20, zorder=2, latlon=True,
         c = combined["ell_percent"], cmap= "summer")
plt.show()

Apply Function
for avenger data practice

def clean_deaths(row):
    num_deaths = 0
    columns = ['Death1', 'Death2', 'Death3', 'Death4', 'Death5']
 
    for c in columns:
        death = row[c]
        if pd.isnull(death) or death == 'NO':
            continue
        elif death == 'YES':
            num_deaths += 1
    return num_deaths

true_avengers['Deaths'] = true_avengers.apply(clean_deaths, axis=1)

OR could do

death_col = list()

death_col += ['Death' + str(i) for i in range(1,6)]

true_avengers['Deaths'] = (true_avengers[death_col] == 'YES').sum(axis = 1)

check if the column value is correct

joined_accuracy_count  = int()
correct_joined_years = true_avengers[true_avengers['Years since joining'] == (2015 - true_avengers['Year'])]
joined_accuracy_count = len(correct_joined_years)

你可能感兴趣的:(Python Beginners(9) -- Clean Data)