第1关 读CSV文件
第2关 清洗列名
第3关 清洗列名(续)
def clean_column_name(col_name: str) -> str:
    """Normalize a raw column label into a lowercase, underscore-separated name.

    The steps run in this order: trim surrounding whitespace, shorten the
    phrase "Operating System" to "OS", turn spaces into underscores, drop
    parentheses, and lowercase the result.

    Args:
        col_name: The original column label.

    Returns:
        The cleaned label.
    """
    col_name = col_name.strip()
    # Abbreviate before spaces become underscores; afterwards the two-word
    # phrase would no longer match.
    col_name = col_name.replace("Operating System", "OS")
    col_name = col_name.replace(" ", "_")
    col_name = col_name.replace("(", "").replace(")", "")
    return col_name.lower()
# Run every existing column label through clean_column_name and install the
# cleaned labels as the DataFrame's columns.
col_name_list = [clean_column_name(col_name) for col_name in laptops.columns]
laptops.columns = col_name_list
print(laptops.columns)
第4关 将字符串列转换为数值列
# Show the distinct raw values stored in the 'ram' column before any cleaning.
unique_ram = laptops['ram'].unique()
print(unique_ram)
第5关 删除非数字字符
# Remove the literal "GB" suffix from every 'ram' value. regex=False states
# explicitly that "GB" is plain text, so the result does not depend on the
# library's default for the `regex` parameter.
laptops["ram"] = laptops["ram"].str.replace("GB", "", regex=False)
# Re-inspect the distinct values to confirm the suffix is gone.
unique_ram = laptops["ram"].unique()
print(unique_ram)
第6关 将列转换为数字类型
# Cast the 'ram' column to integers.
# NOTE(review): assumes every value is a digit-only string at this point — confirm.
laptops["ram"] = laptops["ram"].astype(int)
# Collect the dtype of every column so the conversion can be checked.
dtypes = laptops.dtypes
print(dtypes)
第7关 列的重命名
# Rename the 'ram' column in place so its name carries the unit.
laptops.rename(columns={"ram": "ram_gb"}, inplace=True)
# Summary statistics of the renamed column.
ram_gb_desc = laptops["ram_gb"].describe()
print(ram_gb_desc)
第8关 从字符串中提取数值
# Split each 'cpu' string once on whitespace and keep the leading token as
# the manufacturer.
laptops["cpu_manufacturer"] = laptops["cpu"].str.split(n=1).str.get(0)
# Tally how many rows belong to each manufacturer.
cpu_manufacturer_counts = laptops["cpu_manufacturer"].value_counts()
print(cpu_manufacturer_counts)
第9关 纠正错误值
# Replace each 'os' value with its entry in mapping_dict.
# NOTE(review): mapping_dict is not defined in the visible source — presumably
# supplied by the exercise environment; confirm. If it is a plain dict, any
# 'os' value without a key in it becomes NaN after .map().
laptops["os"] = laptops["os"].map(mapping_dict)
# Print how many rows carry each label after the mapping.
print(laptops["os"].value_counts())
第10关 删除缺失值
# Two non-destructive views of the data: one without any row that holds a
# missing value, one without any column that holds a missing value.
laptops_no_null_rows = laptops.dropna(axis="index")
laptops_no_null_cols = laptops.dropna(axis="columns")
print(laptops_no_null_rows.head(20))
print(laptops_no_null_cols.head(20))
第11关 填充缺失值
# Count, per 'os', the rows whose 'os_version' is missing before any filling.
value_counts_before = laptops.loc[laptops["os_version"].isna(), "os"].value_counts()

# Set 'os_version' for every row of the two operating systems below.
laptops.loc[laptops["os"].eq("No OS"), "os_version"] = "Version Unknown"
laptops.loc[laptops["os"].eq("macOS"), "os_version"] = "X"

# Same count again, to show which operating systems still lack a version.
value_counts_after = laptops.loc[laptops["os_version"].isna(), "os"].value_counts()

print(value_counts_before)
print(value_counts_after)
第12关 挑战:对字符串列进行清洗
# Strip the unit suffix from 'weight' and convert the column to numbers.
# "kgs" is removed before "kg": in the opposite order a value ending in
# "kgs" would be left with a stray "s". regex=False states explicitly that
# both patterns are plain text, independent of the library's default.
laptops["weight"] = pd.to_numeric(
    laptops["weight"]
    .str.replace("kgs", "", regex=False)
    .str.replace("kg", "", regex=False)
)
# Rename in place so the column name carries the unit.
laptops.rename(columns={"weight": "weight_kg"}, inplace=True)
# Persist the cleaned table without the row index column.
laptops.to_csv("laptops_clean.csv", index=False)
print(laptops.columns)