Python Data Analysis

Data grouping (split the records by categorical variables and write each group to a specified path)


import pandas as pd

df = pd.read_csv('hospital2_data1_pro.csv')
# Group by the two categorical columns and write each group to its own Excel file
grouped = df.groupby(['既往应用节育器情况', '使用节育器型号情况'])
for group_name, group_data in grouped:
    file_name = f"{group_name[0]}_{group_name[1]}.xlsx"
    file_path = f"D:\\Desktop\\{file_name}"
    group_data.to_excel(file_path, index=False)

Quick data processing (filtering and transformation)


# DC and RC hold the delivering / receiving city names to keep (defined elsewhere)
mask = (df['Delivering city'] == DC) & (df['Receiving city'] == RC)
df = df.loc[mask]
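
The heading also mentions transformation; a minimal sketch of a derived column, assuming a hypothetical numeric column named weight (in grams):

# 'weight' is a hypothetical column used only for illustration
df['weight_kg'] = df['weight'] / 1000        # unit conversion: grams -> kilograms
df['weight_level'] = df['weight_kg'].apply(lambda v: 'heavy' if v > 10 else 'light')  # derived label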

Quick data processing with dates (extracting a time range)


import pandas as pd

startTime = '2022-07-01'
endTime = '2022-09-30'
start_date = pd.to_datetime(startTime)
end_date = pd.to_datetime(endTime)
# 'Delivery date' stands in for whatever datetime column the data actually uses;
# convert it with pd.to_datetime before the comparison
df['Delivery date'] = pd.to_datetime(df['Delivery date'])
mask = (df['Delivery date'] >= start_date) & (df['Delivery date'] <= end_date)
df = df.loc[mask]
df.to_csv('result.csv', index=False)

Line chart tips


import matplotlib.pyplot as plt

plt.xticks(range(1, len(ind), 5), rotation=90, fontsize=12)  # sparse, rotated x ticks in one call
plt.legend(fontsize=18)
plt.text(maxs, tAveData[maxs], round(PreThreshold[maxs], 2), fontsize=20)  # annotate, keeping two decimals
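
These snippets assume a figure and variables such as ind, maxs, tAveData and PreThreshold already exist. A self-contained sketch with toy data that applies the same tricks (sparse rotated ticks, a larger legend, a text label at the maximum):

import numpy as np
import matplotlib.pyplot as plt

# Toy data standing in for the series used above
ind = list(range(1, 31))
values = np.sin(np.linspace(0, 3, 30)) * 10

plt.plot(ind, values, label='series')
plt.xticks(range(1, len(ind), 5), rotation=90, fontsize=12)  # sparse, rotated x ticks
plt.legend(fontsize=18)
maxs = int(np.argmax(values))
plt.text(ind[maxs], values[maxs], round(values[maxs], 2), fontsize=20)  # annotate the maximum
plt.show()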

Missing value handling


import pandas as pd

df = pd.read_csv('anormal.csv')
# Missing-value count and percentage for each column
missing_values = df.isna().sum()
missing_value_ratio = missing_values / len(df) * 100
for column, ratio in missing_value_ratio.items():
    print(f"{column}: {ratio:.2f}%")
1. Frequency histogram (normalized to a density)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

data = pd.read_csv('订单1.csv')  # change the file name as needed
x = data['profit'].tolist()

plt.rcParams['figure.figsize'] = 10, 6
# density=True rescales the bar heights so the histogram integrates to 1
plt.hist(x, bins=50, density=True, color='SkyBlue', edgecolor='b', alpha=0.6)

# Overlay the fitted normal density curve
overlay = np.linspace(min(x), max(x), 2000)
mean, std = norm.fit(x)
pdf = norm.pdf(overlay, mean, std)
plt.plot(overlay, pdf, 'r-')

plt.savefig("profit.png")
2. Word cloud

import jieba
import wordcloud
from imageio import imread

# Text to visualize
with open('word.txt', 'r', encoding='utf-8') as infile:
    t = infile.read()

# Stop words, one per line
with open('delete.txt', 'rt', encoding='utf-8') as TextFile:
    stopwords = set(TextFile.read().splitlines())

# Image whose non-white area defines the cloud shape
mask = imread("background.jpg")

# Segment the Chinese text with jieba and rejoin with spaces so WordCloud can tokenize it
ls = jieba.lcut(t)
txt = " ".join(ls)
w = wordcloud.WordCloud(font_path="msyh.ttc", mask=mask, width=1000, height=700,
                        background_color="white", stopwords=stopwords)
w.generate(txt)
w.to_file("result.png")
3. Data normalization (min-max scaling)

import numpy as np

# Min-max scaling: map tAveData (a NumPy array) onto the [0, 1] interval
minValue = np.min(tAveData)
maxValue = np.max(tAveData)
AveData = (tAveData - minValue) / (maxValue - minValue)
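
The same scaling can be applied column-wise to a DataFrame, or done with scikit-learn's MinMaxScaler; a sketch, assuming df is a purely numeric DataFrame:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Column-wise min-max scaling (each column mapped to [0, 1])
df_scaled = (df - df.min()) / (df.max() - df.min())

# The same result with scikit-learn
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)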
4. File format conversion (txt to csv)

import csv

# Convert a whitespace-delimited .txt file into a .csv file
with open('train_data.csv', 'w', newline='') as out:
    csv_writer = csv.writer(out, dialect='excel')
    with open('train_data.txt', 'r') as f:
        for line in f:
            fields = line.split()       # split each line on whitespace, one field per csv cell
            csv_writer.writerow(fields)
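
If pandas is already available, the same conversion takes only a couple of lines; a sketch assuming the .txt file is whitespace-delimited and has no header row:

import pandas as pd

# Read the whitespace-delimited text file and write it back out as CSV
df = pd.read_csv('train_data.txt', sep=r'\s+', header=None)
df.to_csv('train_data.csv', index=False, header=False)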
5. ARIMA time-series forecasting

import numpy as np
import pandas as pd
import matplotlib
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.tsa.stattools as ts

matplotlib.rc("font", family='Microsoft YaHei')

# Read the data
infodata = pd.read_excel('data.xlsx')
Year = infodata['Year'].tolist()

pre = 10  # number of steps to forecast ahead
for i in range(0, pre):
    Year.append(2022 + i)

# Forecast each series X1 .. X24 one step at a time, feeding each prediction back in
for i in range(1, 25):
    predata = infodata['X' + str(i)].tolist()
    for j in range(0, pre):
        model = ARIMA(predata, order=(2, 0, 1))
        model_fit = model.fit()
        # one-step-ahead forecast
        yhat = model_fit.predict(len(predata), len(predata))
        predata.append(float(yhat[0]))
    result = np.round(predata)
    print(result)
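
statsmodels.tsa.stattools is imported above but never used, and the order (2, 0, 1) is simply fixed. A quick ADF stationarity check can help decide whether a differencing term d is needed; a sketch using a series such as predata from the loop:

# Augmented Dickey-Fuller test: a p-value above ~0.05 suggests the series is
# non-stationary and may need differencing (a non-zero d in the ARIMA order)
adf_stat, p_value = ts.adfuller(predata)[:2]
print(f"ADF statistic: {adf_stat:.4f}, p-value: {p_value:.4f}")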
6. XGBoost multi-factor prediction

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

data = pd.read_excel('data1.xlsx')
Valuename = list(data.columns)
print(Valuename)

# The first four columns are the targets, the remaining columns are features
valuePre = Valuename[:4]
data_y = data.iloc[:, 0:4]
data_x = data.iloc[:, 4:]

# Train / test split and model fitting
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=0)
model = MultiOutputRegressor(XGBRegressor(objective='reg:squarederror'))
model.fit(x_train, y_train)

# Predict on new data and save the result
yu = pd.read_excel('data3.xlsx')
predict = model.predict(yu)
result = pd.DataFrame(predict, columns=valuePre)
print(result)
result.to_excel('result_predict.xlsx')
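
x_test and y_test are split off above but never used; a quick sketch of evaluating the fitted model on that held-out 20%:

from sklearn.metrics import mean_squared_error, r2_score

# Evaluate the multi-output model on the held-out test split
y_pred = model.predict(x_test)
print("MSE :", mean_squared_error(y_test, y_pred))
print("R^2 :", r2_score(y_test, y_pred))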
7. Correlation coefficients (Spearman and Pearson)

Spearman correlation coefficient


from scipy.stats import spearmanr

# PreThreshold and AveDataSquare are the two series being compared
corr, pValue = spearmanr(PreThreshold, AveDataSquare)
print("Spearman correlation coefficient:", corr, " p-value:", pValue)

Pearson correlation coefficient


from scipy.stats import pearsonr

corr, pValue = pearsonr(A, B)
print("Pearson correlation coefficient:", corr, " p-value:", pValue)

A full correlation example


import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

matplotlib.rc("font", family='Microsoft YaHei')

infodata = pd.read_excel('data2_Stand.xlsx')
xname = infodata.columns

# Columns from index 5 onward are the explanatory variables
txname = list(xname[5:])

df = pd.DataFrame()
df_result = pd.DataFrame()
df_result['variable'] = txname
for i in range(5, len(xname)):
    df['X' + str(i - 4)] = infodata[xname[i]].tolist()

# Columns 1-4 are the target variables; compute the Spearman correlation
# of each target with every explanatory variable
ind = np.arange(len(txname))
linelst = []
for i in range(1, 5):
    df['self'] = infodata[xname[i]].tolist()
    Hotdata = df.corr(method='spearman')
    line = Hotdata['self'].tolist()[:-1]   # drop the self-correlation (always 1)
    linelst.append(line)
    df_result[xname[i]] = line

# One horizontal bar chart per target variable
plt.style.use('ggplot')
plt.figure(figsize=(20, 14))
for k in range(4):
    ax = plt.subplot(1, 4, k + 1)
    # label the bars with variable names in the first panel, indices in the others
    ax.barh(txname if k == 0 else ind, linelst[k],
            color='LightSkyBlue', edgecolor='b', alpha=0.8)
    plt.yticks(fontsize=6)
    plt.xticks(fontsize=6)
plt.show()
df_result.to_excel('result.xlsx')
8. Frequency counting

def count(text, word):
    """Count non-overlapping occurrences of word in text."""
    num = 0
    pos = text.find(word)
    while pos != -1:
        num += 1
        pos = text.find(word, pos + len(word))
    return num
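
A short usage example; for non-overlapping matches the built-in str.count gives the same answer:

text = "the cat sat on the mat, the end"
print(count(text, "the"))   # 3
print(text.count("the"))    # 3, built-in equivalent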
9. Removing missing data

import pandas as pd

# Load the data set
data = pd.read_excel('Monohulled_Sailboats.xlsx')

# Compute and print the missing-value percentage of each column
print("The proportion of missing values per column:")
null_percentages = (data.isnull().sum() / len(data)) * 100
print(null_percentages.apply(lambda x: '{:.2f}%'.format(x)))

# Keep only columns with at least 90% non-null values,
# i.e. drop columns whose missing ratio exceeds 10%
clean_data = data.dropna(thresh=len(data) * 0.9, axis=1)
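
A related cleaning step is to drop only the rows that have missing values in the columns whose missing ratio is below 10%; a sketch building on null_percentages from above:

# Columns with less than 10% missing values
low_missing_cols = null_percentages[null_percentages < 10].index
# Drop only the rows that have a missing value in one of those columns
clean_rows = data.dropna(subset=low_missing_cols)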
