数据分组(按照定类数据对定量数据进行切割,并输出到指定路径)
import pandas as pd
# Split the table by two categorical columns and write each group to its own
# Excel workbook.
df = pd.read_csv('hospital2_data1_pro.csv')
grouped = df.groupby(['既往应用节育器情况', '使用节育器型号情况'])
for group_name, group_data in grouped:
    # group_name holds the pair of key values identifying the current group.
    first_key = group_name[0]
    second_key = group_name[1]
    file_name = f"{first_key}_{second_key}.xlsx"
    file_path = "D:\\Desktop\\" + file_name
    group_data.to_excel(file_path, index=False)
快速数据处理(剔除、变换)
# Keep only the rows whose delivering city equals DC and whose receiving city
# equals RC (df, DC and RC must already be defined).
from_dc = df['Delivering city'] == DC
to_rc = df['Receiving city'] == RC
mask = from_dc & to_rc
df = df.loc[mask]
快速数据处理_时间(提取)
import pandas as pd
# Keep only the rows whose date lies inside [startTime, endTime] (inclusive)
# and save them to result.csv. `df` must already be loaded.
startTime = '2022-07-01'
endTime = '2022-09-30'
# Both bounds have to be tested against the same date column; comparing the
# city columns with timestamps is meaningless.
# NOTE(review): 'Date' is a placeholder -- set it to the real date column name.
date_col = 'Date'
start_date = pd.to_datetime(startTime)
end_date = pd.to_datetime(endTime)
# Parse the column so string dates compare correctly with the Timestamp bounds.
dates = pd.to_datetime(df[date_col])
mask = (dates >= start_date) & (dates <= end_date)
df = df.loc[mask]
df.to_csv('result.csv', index=False)
折线图小技巧
# Rotate the x tick labels and set their font size.
plt.xticks(rotation=90, fontsize=12)
# Place a tick at every fifth position, starting from 1.
tick_positions = range(1, len(ind), 5)
plt.xticks(tick_positions)
plt.legend(fontsize=18)
# Annotate the point at index maxs with a value rounded to two decimals.
annotation = round(PreThreshold[maxs], 2)
plt.text(maxs, tAveData[maxs], annotation, fontsize=20)
缺失值处理
import pandas as pd
# Print the percentage of missing values in every column of the CSV file.
df = pd.read_csv('anormal.csv')
missing_values = df.isna().sum()
row_count = len(df)
# Fraction of missing cells per column, rounded to 4 decimals, as a percentage.
missing_fraction = (missing_values / row_count).round(4)
missing_value_ratio = missing_fraction * 100
for column, ratio in missing_value_ratio.items():
    print(f"{column}: {ratio:.2f}%")
import pandas as pd
from pandas import DataFrame
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
from scipy.stats import norm
# Draw a density histogram of the 'profit' column, overlay the fitted normal
# curve and save the figure.
data = pd.read_csv('订单1.csv')  # change the file name as needed
x = data['profit'].tolist()
plt.rcParams['figure.figsize'] = (10, 6)
plt.hist(
    x,
    bins=50,
    density=True,
    color='SkyBlue',
    edgecolor='b',
    alpha=0.6,
)
# Fit a normal distribution and evaluate its pdf on an even grid over the data range.
mean, std = norm.fit(x)
overlay = np.linspace(min(x), max(x), 2000)
pdf = norm.pdf(overlay, loc=mean, scale=std)
plt.plot(overlay, pdf, 'r-')
plt.savefig("profit.png")
import jieba
import wordcloud
import urllib.request
from imageio import imread
# Build a word cloud from word.txt, shaped by background.jpg, and save it to
# result.png. Stop words are read from delete.txt, one per line.
with open('word.txt', 'r', encoding='utf-8') as infile:
    t = infile.read()
with open("delete.txt", "rt", encoding="utf-8") as TextFile:
    stopwords = TextFile.read().splitlines()
mask = imread("background.jpg")
ls = jieba.lcut(t)
# Join the tokens with spaces so the segmentation done by jieba survives;
# joining with "" would simply rebuild the original unsegmented text.
txt = " ".join(ls)
w = wordcloud.WordCloud(
    font_path="msyh.ttc",
    mask=mask,
    width=1000,
    height=700,
    background_color="white",
    stopwords=set(stopwords),
)
w.generate(txt)
w.to_file("result.png")
import numpy as np
# Min-max scale tAveData: subtract the minimum and divide by the value range.
minValue = np.min(tAveData)
maxValue = np.max(tAveData)
value_span = maxValue - minValue
AveData = (tAveData - minValue) / value_span
import numpy as np
import pandas as pd
import csv
# Convert the whitespace-separated text file train_data.txt into train_data.csv.
# Both files are opened in a `with` block so they are flushed and closed.
# newline='' is passed so the csv writer controls the line endings itself.
with open('train_data.csv', 'w', newline='') as out, open("train_data.txt", "r") as f:
    csv_writer = csv.writer(out, dialect='excel')
    for line in f:
        # split() without arguments splits on any run of whitespace (spaces or
        # tabs), so each field becomes one CSV cell.
        fields = line.split()
        csv_writer.writerow(fields)
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.tsa.stattools as ts
# Forecast `pre` further values for each of the columns X1..X24 of data.xlsx
# with an ARIMA(2, 0, 1) model and print the rounded, extended series.
import matplotlib  # matplotlib.rc below needs the top-level package

matplotlib.rc("font", family='Microsoft YaHei')
# Load the input table.
infodata = pd.read_excel('data.xlsx')
Year = infodata['Year'].tolist()
pre = 10  # number of future steps to predict
# Extend the year list by `pre` entries starting at 2022.
Year.extend(2022 + offset for offset in range(pre))
for i in range(1, 25):
    predata = infodata['X' + str(i)].tolist()
    for j in range(pre):
        # Refit on the series extended so far, then predict one step ahead.
        model = ARIMA(predata, order=(2, 0, 1))
        model_fit = model.fit()
        yhat = model_fit.predict(len(predata), len(predata))
        # predict() yields a single-element result; take that element
        # explicitly instead of calling float() on the whole array.
        predata.append(float(np.asarray(yhat)[0]))
    result = np.round(predata)
    print(result)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
# Train one XGBoost regressor per target column on data1.xlsx, predict the
# targets for the samples in data3.xlsx and save the predictions.
data = pd.read_excel('data1.xlsx')
Valuename = list(data.columns)
print(Valuename)
# The first four columns are the targets; the remaining columns are features.
valuePre = Valuename[:4]
data_y = data.iloc[:, 0:4]
data_x = data.iloc[:, 4:]
# Hold out 20% of the rows; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=0.2, random_state=0
)
# 'reg:squarederror' is the current name of the deprecated 'reg:linear'.
model = MultiOutputRegressor(XGBRegressor(objective='reg:squarederror'))
model.fit(x_train, y_train)
yu = pd.read_excel('data3.xlsx')
predict = model.predict(yu)
result = pd.DataFrame(predict, columns=valuePre)
# NOTE(review): display() is only defined inside IPython/Jupyter sessions.
display(result)
result.to_excel('result_predict.xlsx')
Spearman相关系数
from scipy.stats import spearmanr
# Spearman rank correlation between PreThreshold and AveDataSquare.
spearman_result = spearmanr(PreThreshold, AveDataSquare)
corr, pValue = spearman_result
print("斯皮尔曼相关系数:", corr, " P值:", pValue)
Pearson相关系数
from scipy.stats import pearsonr
# Pearson correlation between A and B.
pearson_result = pearsonr(A, B)
corr, pValue = pearson_result
print("皮尔逊相关系数:", corr, " P值:", pValue)
具体相关实例
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Spearman correlation between each of the four columns at positions 1-4 and
# every column from position 5 onward, drawn as four horizontal bar charts
# and saved to result.xlsx.
matplotlib.rc("font", family='Microsoft YaHei')
infodata = pd.read_excel('data2_Stand.xlsx')
xname = infodata.columns
# Names of the feature columns (position 5 to the end).
txname = [xname[i] for i in range(5, len(xname))]
df = pd.DataFrame()
df_result = pd.DataFrame()
df_result['variable'] = txname
# Copy the feature columns under the short names X1, X2, ...
for i in range(5, len(xname)):
    df['X' + str(i - 4)] = infodata[str(xname[i])].tolist()
ind = np.arange(0, len(txname))
linelst = []
for i in range(1, 5):
    # Put the current target in the last column, named 'self', and read its
    # correlation with every feature from the correlation matrix.
    df['self'] = infodata[str(xname[i])].tolist()
    Hotdata = df.corr(method='spearman')
    line = Hotdata['self'].tolist()
    line = line[0:-1]  # drop the last entry: 'self' correlated with itself
    linelst.append(line)
    df_result[str(xname[i])] = line
# Create exactly one figure with four side-by-side panels, so plt.show() does
# not pop up additional blank figures.
plt.style.use('ggplot')
fig, axes = plt.subplots(ncols=4, figsize=(20, 14))
for panel, (ax, values) in enumerate(zip(axes, linelst)):
    # Only the first panel is labelled with the variable names (in a smaller
    # font); the other panels use the numeric positions.
    y_positions = txname if panel == 0 else ind
    y_label_size = 4 if panel == 0 else 6
    ax.barh(y_positions, values, color='LightSkyBlue', edgecolor='b', alpha=0.8)
    ax.tick_params(axis='y', labelsize=y_label_size)
    ax.tick_params(axis='x', labelsize=6)
plt.show()
df_result.to_excel('result.xlsx')
def count(str, word):
    """Return the number of non-overlapping occurrences of *word* in *str*.

    Occurrences are counted left to right, and the search resumes after the
    end of each match. Returns 0 when *word* does not occur, when *str* is
    empty, or when *word* is empty.

    The parameter name ``str`` shadows the builtin inside this function; it
    is kept so that existing keyword callers continue to work.
    """
    # An empty pattern would never advance the search position.
    if not word:
        return 0
    # The string method performs the same left-to-right, non-overlapping scan
    # and correctly stops once no further match exists.
    return str.count(word)
import pandas as pd
# Load the data set.
data = pd.read_excel('Monohulled_Sailboats.xlsx')
# Compute and print the percentage of missing values in each column.
print("The proportion of missing values per column:")
row_count = len(data)
null_counts = data.isnull().sum()
null_percentages = (null_counts / row_count) * 100
print(null_percentages.map(lambda x: '{:.2f}%'.format(x)))
# Keep only the columns that have at least 90% non-missing values; columns
# with fewer non-missing values are dropped.
min_non_null = row_count * 0.9
clean_data = data.dropna(thresh=min_non_null, axis='columns')