import numpy as np
import pandas as pd
from sqlalchemy import create_engine
# Demo: horizontal and vertical concatenation of the order-detail table.
# NOTE(review): the credential/host part of this URL ("[email protected]") looks
# like it was mangled by e-mail obfuscation. Restore the real
# "password@host" before running, and avoid committing real credentials.
conn = create_engine('mysql+pymysql://root:[email protected]:3306/chat?charset=utf8')
detail1 = pd.read_sql('meal_order_detail1', conn)

# Horizontal concatenation: split by columns, then re-join on the shared index.
df1 = detail1.iloc[:, :10]  # first 10 columns of detail1
df2 = detail1.iloc[:, 10:]  # columns from position 10 onward
print('内连接合并后的数据框大小为:', pd.concat([df1, df2], axis=1, join='inner').shape)
print('外连接合并后的数据框大小为:', pd.concat([df1, df2], axis=1, join='outer').shape)

# Vertical concatenation: split by rows, then stack them back (axis=0).
df3 = detail1.iloc[:1500, :]  # first 1500 rows of detail1
df4 = detail1.iloc[1500:, :]  # rows from position 1500 onward
print('内连接纵向合并后的数据框大小为:', pd.concat([df3, df4], axis=0, join='inner').shape)
print('外连接纵向合并后的数据框大小为:', pd.concat([df3, df4], axis=0, join='outer').shape)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks rows the same way.
print('append纵向堆叠后的数据框大小为:', pd.concat([df3, df4]).shape)
# Demo: key-based merge of the order-detail table with the order-info table.
order = pd.read_csv('meal_order_info.csv', sep=',', encoding='gb18030')  # order-info table
# Convert info_id to string in preparation for the merge.
order['info_id'] = order['info_id'].astype('str')
# The key is named order_id in the detail table and info_id in the order-info table.
order_detail = pd.merge(detail1, order, left_on='order_id', right_on='info_id')
# The label announces a shape, so print the shape rather than the whole frame.
print('订单详情表和订单信息表主键合并后的形状为:', order_detail.shape)
# Demo: overlapping merge. Holes (NaN) in df5 are filled from df6, which
# shares the same index and columns.
dict1 = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
    'System': ['win10', 'win10', np.nan, 'win10',
               np.nan, np.nan, 'win7', 'win7', 'win8'],
    'cpu': ['i7', 'i5', np.nan, 'i7', np.nan, np.nan, 'i5', 'i5', 'i3'],
}
dict2 = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
    'System': [np.nan, np.nan, 'win7', np.nan,
               'win8', 'win7', np.nan, np.nan, np.nan],
    'cpu': [np.nan, np.nan, 'i3', np.nan, 'i7',
            'i5', np.nan, np.nan, np.nan],
}
df5 = pd.DataFrame(dict1)
df6 = pd.DataFrame(dict2)
print('经过重叠合并后的数据为:\n', df5.combine_first(df6))
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
# Load the three order-detail tables and stack them vertically.
detail1 = pd.read_sql('meal_order_detail1', conn)
detail2 = pd.read_sql('meal_order_detail2', conn)
detail3 = pd.read_sql('meal_order_detail3', conn)
# The original notes record the table shapes as (2779, 19), (3647, 19), (3611, 19).
# DataFrame.append was removed in pandas 2.0; one concat stacks all three.
detail = pd.concat([detail1, detail2, detail3])
print('三张订单详情表合并后的形状为:', detail.shape)
# The original notes record the stacked shape as (10037, 19).
# Merge order details with order info, then with user info, on their keys.
order = pd.read_csv('meal_order_info.csv', sep=',', encoding='gb18030')  # order-info table
user = pd.read_excel('users_info.xlsx')  # user-info table
# Cast the key columns to string before merging.
order['info_id'] = order['info_id'].astype('str')
order['emp_id'] = order['emp_id'].astype('str')
user['USER_ID'] = user['USER_ID'].astype('str')
data = pd.merge(detail, order,
                left_on=['order_id', 'emp_id'],
                right_on=['info_id', 'emp_id'])
data = pd.merge(data, user, left_on='emp_id', right_on='USER_ID', how='inner')
print('三张表数据主键合并后的大小为:', data.shape)
# De-duplicate dish names three ways, then look at feature correlation.
# `dishes` was referenced without being defined; build it from the column
# that the next line also de-duplicates.
dishes = list(detail['dishes_name'])
dish_set = set(dishes)  # a set keeps one copy of each value
dishes_name = detail['dishes_name'].drop_duplicates()
shapeDet = detail.drop_duplicates(subset=['order_id', 'emp_id']).shape
# Similarity between sales count and price.
corrDet = detail[['counts', 'amounts']].corr(method='kendall')
print('销量和售价的kendall相似度为:\n', corrDet)
# Only numeric columns can be correlated. Since pandas 2.0 a string column
# makes corr() raise, so the non-numeric column is dropped explicitly.
corrDet1 = (detail[['dishes_name', 'counts', 'amounts']]
            .select_dtypes(include='number')
            .corr(method='pearson'))
print('菜品名称,销量和售价的pearson相似度为:\n', corrDet1)
def FeatureEquals(df):
    """Return a square matrix marking which columns of ``df`` are identical.

    Parameters:
        df: DataFrame whose columns are compared pairwise.

    Returns:
        A DataFrame indexed and labelled by ``df.columns``. Cell ``[i, j]``
        is the result of ``df[i].equals(df[j])``.
    """
    dfEquals = pd.DataFrame([], columns=df.columns, index=df.columns)
    for i in df.columns:
        for j in df.columns:
            dfEquals.loc[i, j] = df.loc[:, i].equals(df.loc[:, j])
    return dfEquals
# Find columns that duplicate an earlier column and drop them from detail.
detEquals = FeatureEquals(detail)
print('detail的特征相等矩阵的前5行5列为:\n', detEquals.iloc[:5, :5])
lenDet = detEquals.shape[0]
dupCol = []
# Only the upper triangle is scanned, so the first column of each duplicate
# group is kept and the later ones are collected for removal.
for k in range(lenDet):
    for m in range(k + 1, lenDet):
        if detEquals.iloc[k, m] and (detEquals.columns[m] not in dupCol):
            dupCol.append(detEquals.columns[m])
print('需要删除的列为:', dupCol)
detail.drop(dupCol, axis=1, inplace=True)
print('删除多余列后detail的特征数目为:', detail.shape[1])
# Inspect missing values, show the effect of dropping incomplete columns,
# then replace every remaining missing value with the sentinel -99.
print('detail每个特征缺失的数目为:\n', detail.isnull().sum())
print('detail每个特征非缺失的数目为:\n', detail.notnull().sum())
print('去除缺失的列后detail的形状为:', detail.dropna(axis=1, how='any').shape)
detail = detail.fillna(-99)
print('detail每个特征缺失的数目为:\n', detail.isnull().sum())
import numpy as np
from scipy.interpolate import interp1d
# Demo: estimate y at the missing points x = 6 and x = 7 three ways.
x = np.array([1, 2, 3, 4, 5, 8, 9, 10])  # independent variable
y1 = np.array([2, 8, 18, 32, 50, 128, 162, 200])  # dependent variable y1
y2 = np.array([3, 5, 7, 9, 11, 17, 19, 21])  # dependent variable y2

# Linear interpolation
LinearInsValue1 = interp1d(x, y1, kind='linear')
LinearInsValue2 = interp1d(x, y2, kind='linear')
print('当x为6、7时,使用线性插值y1为:', LinearInsValue1([6, 7]))
print('当x为6、7时,使用线性插值y2为:', LinearInsValue2([6, 7]))

# Lagrange interpolation
from scipy.interpolate import lagrange
LargeInsValue1 = lagrange(x, y1)
LargeInsValue2 = lagrange(x, y2)
print('当x为6,7时,使用拉格朗日插值y1为:', LargeInsValue1([6, 7]))
print('当x为6,7时,使用拉格朗日插值y2为:', LargeInsValue2([6, 7]))

# Spline interpolation
# scipy.interpolate.spline was removed from SciPy; make_interp_spline with
# k=3 builds a cubic interpolating spline that is then evaluated at the
# new points.
from scipy.interpolate import make_interp_spline
SplineInsValue1 = make_interp_spline(x, y1, k=3)(np.array([6, 7]))
SplineInsValue2 = make_interp_spline(x, y2, k=3)(np.array([6, 7]))
print('当x为6,7时,使用样条插值y1为:', SplineInsValue1)
print('当x为6,7时,使用样条插值y2为:', SplineInsValue2)
def outRange(Ser1):
    """Return the values of ``Ser1`` flagged as outliers by the 3-sigma rule.

    A value is an outlier when it lies below ``mean - 3 * std`` or above
    ``mean + 3 * std``. The original upper bound used the variance instead
    of the standard deviation, which made the two bounds inconsistent.

    Parameters:
        Ser1: numeric pandas Series.

    Returns:
        The sub-Series of ``Ser1`` holding only the outlying values.
    """
    center = Ser1.mean()
    spread = 3 * Ser1.std()
    boolInd = (Ser1 < center - spread) | (Ser1 > center + spread)
    # Select positionally, as the original did via np.arange + iloc.
    return Ser1[boolInd.values]
# Report the outliers that outRange finds in the sales-count column.
outlier = outRange(detail['counts'])
print('使用拉依达准则判定异常值个数为:', outlier.shape[0])
print('异常值的最大值为:', outlier.max())
print('异常值的最小值为:', outlier.min())
import matplotlib.pyplot as plt
# Draw a box plot of the sales counts and read the outliers back from it.
plt.figure(figsize=(10, 8))
p = plt.boxplot(detail['counts'].values, notch=True)  # draw the box plot
outlier1 = p['fliers'][0].get_ydata()  # 'fliers' holds the outlier markers
plt.show()
print('销售量数据异常值个数为:', len(outlier1))
print('销售量数据异常值的最大值为:', max(outlier1))
print('销售量数据异常值的最小值为:', min(outlier1))
import pandas as pd
# Load the order-detail CSV, using its first column as the index.
detail = pd.read_csv('detail.csv', index_col=0, encoding='gbk')
# Remove duplicated rows.
detail.drop_duplicates(inplace=True)
## Feature (column) de-duplication
def FeatureEquals(df):
    """Return a square matrix marking which columns of ``df`` are identical.

    Parameters:
        df: DataFrame whose columns are compared pairwise.

    Returns:
        A DataFrame indexed and labelled by ``df.columns``. Cell ``[i, j]``
        is the result of ``df[i].equals(df[j])``.
    """
    dfEquals = pd.DataFrame([], columns=df.columns, index=df.columns)
    for i in df.columns:
        for j in df.columns:
            dfEquals.loc[i, j] = df.loc[:, i].equals(df.loc[:, j])
    return dfEquals
detEquals = FeatureEquals(detail)  # apply the function above
## Scan the upper triangle of the equality matrix so that the first column
## of each duplicate group is kept and the later ones are collected.
lenDet = detEquals.shape[0]
dupCol = []
for k in range(lenDet):
    for m in range(k + 1, lenDet):
        if detEquals.iloc[k, m] and (detEquals.columns[m] not in dupCol):
            dupCol.append(detEquals.columns[m])
## Drop the duplicated columns
detail.drop(dupCol, axis=1, inplace=True)
print('进行去重操作后订单详情表的形状为:', detail.shape)
## Missing-value rate of every feature, formatted as a percentage string
naRate = (detail.isnull().sum() / detail.shape[0] * 100).astype('str') + '%'
print('detail每个特征缺失的率为:\n', naRate)
## Drop the columns whose values are all missing
detail.dropna(axis=1, how='all', inplace=True)
print('经过缺失值处理后订单详情表各特征缺失值的数目为:\n', detail.isnull().sum())
## Outlier detection and treatment
def outRange(Ser1):
    """Cap box-plot outliers of ``Ser1`` at the nearest quartile.

    Values above ``QU + 1.5 * IQR`` are replaced with the upper quartile
    ``QU``. Values below ``QL - 1.5 * IQR`` are replaced with the lower
    quartile ``QL``.

    Parameters:
        Ser1: numeric pandas Series.

    Returns:
        A new Series with the outliers replaced. The input is left
        untouched; assign the result back to keep it.
    """
    QL = Ser1.quantile(0.25)
    QU = Ser1.quantile(0.75)
    IQR = QU - QL
    # Work on a copy so the caller's Series is not modified behind its back.
    capped = Ser1.copy()
    capped.loc[capped > (QU + 1.5 * IQR)] = QU
    capped.loc[capped < (QL - 1.5 * IQR)] = QL
    return capped
# Cap the outliers of sales count and price, then store the results back.
detail['counts'] = outRange(detail['counts'])
detail['amounts'] = outRange(detail['amounts'])
## Minimum and maximum of sales count and price after treatment
print('销售量最小值为:', detail['counts'].min())
print('销售量最大值为:', detail['counts'].max())
print('售价最小值为:', detail['amounts'].min())
print('售价最大值为:', detail['amounts'].max())
import pandas as pd
import numpy as np
# Load the order-detail CSV, using its first column as the index.
detail = pd.read_csv('detail.csv', index_col=0, encoding='gbk')
def MinMaxScale(data):
    """Rescale ``data`` linearly to the range [0, 1] (min-max scaling).

    Parameters:
        data: numeric pandas Series (or DataFrame, scaled column by column).

    Returns:
        ``(data - min) / (max - min)``. A constant input has ``max == min``,
        so the division is 0/0 and the result is NaN.
    """
    lowest = data.min()
    return (data - lowest) / (data.max() - lowest)
## Min-max scale the sales count and price of the order-detail table
data1 = MinMaxScale(detail['counts'])
data2 = MinMaxScale(detail['amounts'])
data3 = pd.concat([data1, data2], axis=1)
print('离差标准化之后销量和售价数据为:\n', data3.head())
def StandardScaler(data):
    """Standardise ``data`` to zero mean and unit standard deviation.

    Parameters:
        data: numeric pandas Series (or DataFrame, scaled column by column).

    Returns:
        ``(data - mean) / std``, using pandas' ``std()``.
    """
    centered = data - data.mean()
    return centered / data.std()
## Standardise (z-score) the sales count and price of the order-detail table
data4 = StandardScaler(detail['counts'])
data5 = StandardScaler(detail['amounts'])
data6 = pd.concat([data4, data5], axis=1)
print('标准差标准化之后销量和售价数据为:\n', data6.head())
def DecimalScaler(data):
    """Scale ``data`` by a power of ten derived from its largest magnitude.

    The divisor is ``10 ** ceil(log10(max(|data|)))``.

    Parameters:
        data: numeric pandas Series.

    Returns:
        ``data`` divided by that power of ten. When the largest magnitude is
        itself an exact power of ten, the scaled maximum is exactly 1.
    """
    exponent = np.ceil(np.log10(data.abs().max()))
    return data / 10 ** exponent
## Decimal-scale the sales count and price of the order-detail table
data7 = DecimalScaler(detail['counts'])
data8 = DecimalScaler(detail['amounts'])
data9 = pd.concat([data7, data8], axis=1)
print('小数定标标准化之后的销量和售价数据:\n', data9.head())
import pandas as pd
import numpy as np
# Demo: dummy-variable encoding and equal-width discretisation.
detail = pd.read_csv('detail.csv', encoding='gbk')
data = detail.loc[0:5, 'dishes_name']  # a small slice for the demonstration
print('哑变量处理前的数据为:\n', data)
print('哑变量处理后的数据为:\n', pd.get_dummies(data))
# Split the price into 5 equal-width intervals.
price = pd.cut(detail['amounts'], 5)
print('离散化后5条记录售价分布为:\n', price.value_counts())
def SameRateCut(data, k):
    """Discretise ``data`` into ``k`` equal-frequency bins.

    Bin edges are the quantiles at 0, 1/k, ..., 1.

    Parameters:
        data: numeric pandas Series.
        k: number of bins.

    Returns:
        A categorical Series of intervals, as produced by ``pd.cut``.

    Raises:
        ValueError: raised by ``pd.cut`` when two quantile edges coincide.
    """
    # linspace gives exactly k + 1 probabilities ending at 1.0; a float-step
    # arange can drift and change the number of edges.
    w = data.quantile(np.linspace(0, 1, k + 1))
    # include_lowest keeps the minimum value, which would otherwise fall
    # outside the left-open first interval and become NaN.
    return pd.cut(data, w, include_lowest=True)
# Equal-frequency discretisation of the dish price into 5 bins.
result = SameRateCut(detail['amounts'], 5).value_counts()
print('菜品数据等频法离散化后各个类别数目分布状况为:', '\n', result)
def KmeanCut(data, k, random_state=None):
    """Discretise ``data`` into ``k`` bins whose edges come from K-Means.

    The values are clustered in one dimension. The midpoints between
    neighbouring (sorted) cluster centres become the inner bin edges; the
    outer edges are 0 and ``data.max()``.

    Parameters:
        data: numeric pandas Series.
        k: number of clusters, and therefore of bins.
        random_state: forwarded to ``KMeans``; pass an int for reproducible
            bins. The default ``None`` keeps the original behaviour.

    Returns:
        A categorical Series of intervals, as produced by ``pd.cut``. The
        first interval is ``(0, edge]``, so values <= 0 are returned as NaN.
    """
    from sklearn.cluster import KMeans  # local import, as in the original

    kmodel = KMeans(n_clusters=k, random_state=random_state)  # build the model
    kmodel.fit(data.values.reshape((len(data), 1)))  # fit on a single feature
    c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)  # sorted centres
    w = c.rolling(2).mean().iloc[1:]  # midpoints of neighbouring centres
    w = [0] + list(w[0]) + [data.max()]  # add the outer edges
    return pd.cut(data, w)
# Cluster-based (K-Means) discretisation of the dish price into 5 bins.
result = KmeanCut(detail['amounts'], 5).value_counts()
print('菜品售价聚类离散化后各个类别数目分布状况为:', '\n', result)