1,打开: data=pd.read_csv('data_etr.csv',parse_dates=['DATA_DATE'],encoding='gbk')
data.head()  # preview the first five rows of the CSV data loaded above
# Load the 'meal_order_detail1' sheet of the order-detail workbook.
# (The closing quote of the sheet name was missing, which is a SyntaxError.)
data = pd.read_excel('data/meal_order_detail.xlsx', sheet_name='meal_order_detail1')
2,透视表:pd.pivot_table(data=data,values='KWH',index='CONS_NO',columns='DATA_DATE')
(index 指定行,columns 指定列,values 指定取值)
3,堆叠:pd.concat([df1,df2],axis=0,join='inner') #axis=0是纵向堆叠,axis=1是横向堆叠;'inner'取交集,'outer'取并集
4,记录重复:
# Record (row) duplicates: load the order-detail sheet, then de-duplicate.
data=pd.read_excel('data/meal_order_detail.xlsx', sheet_name='meal_order_detail1')
data['dishes_name'].drop_duplicates()  # de-duplicate the dish names
data.drop_duplicates(subset=['order_id','emp_id']).shape  # subset names the columns used to decide whether two rows are duplicates
# Neither result above is assigned back, so `data` itself is unchanged.
data
5,特征重复:
# Feature (column) duplicates: compare every pair of columns with
# Series.equals and record the result in a square True/False table.
cols = ['counts', 'amounts', 'dishes_name']
sim_dis = pd.DataFrame(index=cols, columns=cols)
for i in cols:
    for j in cols:
        # equals() is True only when both columns hold identical values.
        sim_dis.loc[i, j] = data[i].equals(data[j])
print(sim_dis)
6,检测与处理缺失值:
# Locate missing values with isnull() or notnull().
data.describe()  # describe() summarises only the numeric columns by default; categorical columns are left out
data.isnull().sum()  # isnull() is True for a missing value (False means "not missing"); sum() counts the missing values per column
data.notnull().sum()  # notnull() is True for a non-missing value (False means "missing"); sum() counts the non-missing values per column
7,发现缺失值后进行的处理
# 1. Deletion: drop the rows or columns that contain missing values.
import numpy as np

# Small demo table; 'System' and 'cpu' are missing in the same three rows.
dict1 = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
    'System': ['win10', 'win10', np.nan, 'win10', np.nan, np.nan,
               'win7', 'win7', 'win8'],
    'cpu': ['i7', 'i5', np.nan, 'i7', np.nan, np.nan, 'i5', 'i5', 'i3'],
    'pp': [1, 3, 2, 4, 5, 6, 7, 8, 9],
}
a = pd.DataFrame(data=dict1)

# None of the results below is assigned back, so `a` keeps its missing
# values; pass inplace=True to modify `a` itself instead.
a.dropna(axis=0)  # drop every row that has at least one missing value
a.dropna(subset=['cpu'])  # only look at 'cpu': drop rows where it is missing
a.dropna(axis='columns')  # drop the features (columns) that contain missing values

# 2. Replacement: fill the missing entries with a fixed value.
a['System'].fillna(value='i7')
a['System'].fillna(value='i7').value_counts()  # value_counts() tallies each distinct value
# 3. Interpolation
# Linear interpolation: fit straight segments between the known (x, y)
# points, leaving the gap between x=3 and x=6 to be estimated.
from scipy.interpolate import interp1d

x = np.array([1, 2, 3, 6, 7])
y = np.array([3, 5, 6, 9, 13])

# `model` is callable, e.g. model(4) estimates y at x=4. By default,
# evaluating outside the range of x raises ValueError.
model = interp1d(x, y, kind='linear')
异常值
# Box plot: visualise the distribution of 'counts' to spot outliers.
import matplotlib.pyplot as plt

# boxplot() returns a dict of the drawn artists; p['fliers'] holds the
# points plotted beyond the whiskers (the outlier candidates).
p = plt.boxplot(data['counts'])
plt.show()  # show must be called; a bare `plt.show` does nothing