import pandas as pd
from scipy.io import arff
# Load the Dry Bean ARFF file: loadarff yields the records and their metadata.
data, meta = arff.loadarff("../Dry_Bean_Dataset/Dry_Bean_Dataset.arff")
df = pd.DataFrame(data)

# Quick look at the table: dimensions, first rows, missing values per column,
# summary statistics (one row per column) and the distinct class labels.
print(df.shape)
print(df.head())
print(df.isna().sum())
print(df.describe().transpose())
print(df["Class"].unique())
from sklearn.preprocessing import LabelEncoder
# Turn the class labels into integer codes. Only the Class column is encoded,
# so a single fit_transform is enough -- there is nothing to iterate over.
lb_encoder = LabelEncoder()
df.Class = lb_encoder.fit_transform(df.Class)

# Number of samples per encoded class.
print(df.Class.value_counts())
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of the number of samples in each class, titled 'Class'.
sns.countplot(x=df["Class"]).set_title('Class')
plt.show()
# Missing-value count per column, largest first (the value is only displayed
# when this runs as the last expression of a notebook cell).
DataDF.isna().sum().sort_values(ascending=False)
'''移除行或列'''
# Keep only the rows that have no missing value.
drop_row_data = train_data.dropna(axis=0, how="any")
# Keep only the columns that have no missing value.
drop_col_data = train_data.dropna(axis="columns", how="any")
均值
# Embarked holds port codes ('S', 'C', 'Q'), so it has no mean; fill the gaps
# with its most frequent value instead. For a numeric column the same pattern
# works with .mean() in place of .mode().iloc[0].
all_features['Embarked'] = all_features['Embarked'].fillna(all_features['Embarked'].mode().iloc[0])
最近邻,前后,在时间序列分析中比较常见
# Series.ffill()/bfill() replace the deprecated fillna(method=...) spelling.
print(DataDF.UnitPrice.ffill())  # forward fill: carry the previous valid value down
print(DataDF.UnitPrice.bfill())  # backward fill: pull the next valid value up
# Box plot of Age for each passenger class. The draw call and plt.show() must
# be separate statements; fused on one line they are a syntax error.
sns.boxplot(x=train_data["Pclass"], y=train_data["Age"])
plt.show()
![Age 按 Pclass 分组的箱线图](<【数据处理】:(二)文本类/1647438088486.png>)
'''
通过数据可视化,发现Age与Pclass相关性较大
'''
def add_age(cols):
    """Return a passenger's age, imputing a missing one from the class mean.

    Args:
        cols: A two-item row ``(Age, Pclass)``, e.g. one row of
            ``train_data[['Age', 'Pclass']]`` as handed over by
            ``DataFrame.apply(..., axis=1)``. Lists and tuples work too.

    Returns:
        ``Age`` unchanged when it is present. Otherwise the mean ``Age`` of
        the rows of the module-level ``train_data`` that share the same
        ``Pclass`` (NaN when no such row has a known age).
    """
    # Unpack by position rather than cols[0] / cols[1]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas.
    age, pclass = cols
    if not pd.isnull(age):
        return age
    # One lookup covers every class instead of a branch per class value.
    same_class = train_data['Pclass'] == pclass
    return train_data.loc[same_class, 'Age'].mean()
# Row-wise pass over the (Age, Pclass) pairs: add_age supplies the value for
# each row and the result overwrites the Age column.
train_data['Age'] = train_data[['Age', 'Pclass']].apply(add_age, axis='columns')
# Pull the title out of each name: the run of letters that follows a space and
# ends with a period. The pattern is a raw string so that "\." is not treated
# as an (invalid) string escape.
df_data['Title'] = df_data.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Title counts split by sex, with sexes as rows and titles as columns.
pd.crosstab(df_data['Title'], df_data['Sex']).transpose()
Title | Capt | Col | Countess | Don | Dona | Dr | Jonkheer | Lady | Major | Master | Miss | Mlle | Mme | Mr | Mrs | Ms | Rev | Sir |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Sex | ||||||||||||||||||
female | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 260 | 2 | 1 | 0 | 187 | 2 | 0 | 0 |
male | 1 | 4 | 0 | 1 | 0 | 7 | 1 | 0 | 2 | 61 | 0 | 0 | 0 | 757 | 0 | 0 | 8 | 1 |
"""
根据统计结果填补Age
"""
# Title of each passenger: the letters directly before the first period in
# Name. One vectorised extract fills the whole column, so no loop and no
# placeholder initialisation are needed; expand=False yields a Series.
train_data['Initial'] = train_data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Title-by-sex counts as a colour-graded table (rendered only in a notebook).
pd.crosstab(train_data.Initial, train_data.Sex).T.style.background_gradient()

# Fold the rare titles into the common groups. The result is assigned back
# rather than replaced in place on the column, which does not reliably reach
# the parent frame under pandas copy-on-write.
title_groups = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}
train_data['Initial'] = train_data['Initial'].replace(title_groups)

# Fill each missing Age with the fixed value chosen for the passenger's group.
age_by_title = {'Mr': 33, 'Mrs': 36, 'Master': 5, 'Miss': 22, 'Other': 46}
for title, age in age_by_title.items():
    needs_age = train_data['Age'].isnull() & (train_data['Initial'] == title)
    train_data.loc[needs_age, 'Age'] = age
from sklearn.preprocessing import LabelEncoder
# Turn the class labels into integer codes. Only the Class column is encoded,
# so a single fit_transform is enough -- there is nothing to iterate over.
lb_encoder = LabelEncoder()
df.Class = lb_encoder.fit_transform(df.Class)

# Number of samples per encoded class.
print(df.Class.value_counts())
若该属性只有几类,可直接替换
# Map the three port codes straight to integers; a value that is not a key of
# the mapping becomes NaN.
train_data['Embarked'] = train_data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
或
# Encode the categories with one replace per column and assign the result
# back: an in-place replace on an attribute-accessed column does not reliably
# update the parent frame under pandas copy-on-write. Values that are not
# listed are left unchanged.
all_features['Sex'] = all_features['Sex'].replace({'male': 1, 'female': 0})
all_features['Embarked'] = all_features['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})
在缩放中更改数据的范围,
而在规范化方面,更改数据分布的形状。
# Scaling: rescale the goal column with minmax_scaling
# (NOTE(review): presumably mlxtend.preprocessing.minmax_scaling -- confirm the import).
original_goal_data = pd.DataFrame(kickstarters_2017["goal"])
scaled_goal_data = minmax_scaling(original_goal_data, columns=["goal"])

# Normalization: Box-Cox transform of the pledged amounts. Only the strictly
# positive pledges are passed to it.
original_pledged = pd.DataFrame(kickstarters_2017["pledged"])
index_positive_pledges = kickstarters_2017.pledged > 0
positive_pledges_only = kickstarters_2017.pledged.loc[index_positive_pledges]
# Element 0 of the boxcox result is the transformed data; keep the source
# index so the values line up with the original rows.
normalized_values = pd.Series(
    data=stats.boxcox(positive_pledges_only)[0],
    index=positive_pledges_only.index,
    name='pledged',
)
# Parse the month/day/four-digit-year strings in 'date' into datetimes.
landslides['date_parsed'] = pd.to_datetime(arg=landslides['date'], format="%m/%d/%Y")
注意:format 中的年份占位符区分大小写——%Y 匹配四位年份(如 2007),%y 只匹配两位年份(如 07),四位年份的数据必须用大写的 %Y。
# Day-of-month component of every parsed date.
day_of_month_earthquakes = earthquakes.loc[:, 'date_parsed'].dt.day
对于同一个属性的同一个值,由于输入时空格,大小写等原因,造成不一致的问题
# Distinct country spellings, captured before any clean-up.
countries = professors['Country'].unique()

# Normalise case, then trim leading/trailing whitespace, in a single pass.
professors['Country'] = professors['Country'].str.lower().str.strip()
Python strip() 方法用于移除字符串头尾指定的字符(默认为空格或换行符)或字符序列。
Python lower() 方法转换字符串中所有大写字符为小写。
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    """Overwrite near-duplicate spellings in ``df[column]`` with one value.

    Args:
        df: DataFrame to clean; it is modified in place.
        column: Name of the column holding the strings.
        string_to_match: Canonical spelling. Values that score close to it
            are replaced by it.
        min_ratio: Lowest ``token_sort_ratio`` score a value needs in order
            to be replaced.

    Returns:
        None.
    """
    strings = df[column].unique()

    # The ten best candidates; each entry is indexed below as (value, score).
    matches = fuzzywuzzy.process.extract(string_to_match, strings, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # Keep only the candidates that reach the threshold.
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]

    rows_with_matches = df[column].isin(close_matches)
    df.loc[rows_with_matches, column] = string_to_match


# The function has to be defined before this point: a script runs top to
# bottom, so calling it any earlier raises NameError.
# Ten closest spellings to "usa" among the known country values.
matches = fuzzywuzzy.process.extract("usa", countries, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
replace_matches_in_column(df=professors, column='Country', string_to_match="usa", min_ratio=70)