import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore') # Suppress all warnings
plt.rcParams['font.sans-serif'] = ['SimHei'] # Use the SimHei font so Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False # Render the minus sign correctly on axes
# Load the dataset from a path relative to the current working directory.
url = r'./data/Pokemon.csv'
# read_csv already returns a DataFrame, so no extra DataFrame() wrapping is needed.
data = pd.read_csv(url)
# Show the first five rows.
print('前五行数据:')
print(data.head(5))
# Show the last three rows.
print('后三行数据')
print(data.tail(3))
# Drop the last two rows of the frame.
# NOTE(review): presumably these trailing rows are invalid records -- confirm.
data = data.iloc[:-2]
# Show the column names.
print('列名:',data.columns)
# Show the number of rows and columns.
print('行数与列数:',data.shape)
# Show index, dtype and memory information. info() prints its report itself
# and returns None, so wrapping it in print() would emit a stray "None" line.
data.info()
# Report missing values per column, as absolute counts and as a ratio of rows.
raw, column = data.shape  # raw = number of rows, column = number of columns
# Per-column count of missing values, sorted ascending. The former
# `if None:` branch could never run and referenced an undefined `col`,
# so only the whole-frame computation is kept.
num = data.isnull().sum().sort_values()
# The labels are printed on their own; the previous leading `None` argument
# would have shown up as the literal text "None" in the output.
print(r'缺失数')
print(num)
print(r'缺失比例')
print(num/raw)
# Replace every missing value with the literal string "null".
# NOTE(review): a numeric column that contains missing values becomes object
# dtype after this, and a later astype(float) on it would fail -- confirm that
# only text columns have gaps.
data = data.fillna("null")
# Duplicate handling, keyed on the '#' column.
# Flag rows whose '#' value already appeared earlier in the frame.
duplicate_ids = data.duplicated('#')
# Select every row flagged as a duplicate.
duplicate_rows = data[duplicate_ids]
# Print the rows that carry a duplicated id.
print("具有重复 id 的行:")
print(duplicate_rows)
# For repeated entries keep only the first record.
data = data.drop_duplicates(['#'],keep='first')
# Cast the five numeric columns to float one by one, then draw a box plot
# with one box per column and display the figure.
stat_columns = ['Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
for stat in stat_columns:
    data[stat] = data[stat].astype(float)
data.boxplot(column=stat_columns)
plt.show()
# Check whether the Generation column contains non-numeric values.
# With errors='coerce', values that cannot be parsed as numbers become NaN,
# so the mask is True exactly for those rows.
non_numeric_generation = pd.to_numeric(data['Generation'], errors='coerce').isna()
# Show the offending rows before removing them.
print(data[non_numeric_generation])
# Keep only the rows whose Generation value is numeric.
data = data[~non_numeric_generation]