20数学建模校赛C题

首先感谢我的队友,给出了很多关键的思路和想法,论文也基本没让我上手

题就是这么个题 拿过来还觉得贼费劲

20数学建模校赛C题_第1张图片

先上个思路,不好意思让你们在这里还要看我手写的东西

思路就是这么个思路 弄出来也贼费劲

特别是发现异常的过程,数据,作图,分组,发现问题,删一部分;数据,作图,分组,发现问题,删一部分……
这个过程跟队友一起搞了好久
这部分多且杂,这篇比较系统,为了一个清晰的思路明天再总结总结数据处理发现问题的过程(啊代码也是)

大佬说,最重要的是数据。数据和特征决定了机器学习的上限,而模型和算法只是逼近这个上限

懵懵懂懂,毕竟第四题的模型效果辣眼睛,也不知道是不是数据的问题,但是如果是数据的问题,明明第三题还阔以哇!!!

木事,听说哈梅巴赫也是在华尔街做宽客时经历了一次公司停电丢数据才意识到

但愿有生之年也有机会能像他一样潇洒

再次感谢我的队友,好的下面上思路
20数学建模校赛C题_第2张图片
20数学建模校赛C题_第3张图片

图片还转不过来真不好意思

代码也就这些代码 敲出来也贼费劲

一.附件一 数据清洗

# Section 1: clean attachment 1 (list1.csv) and save the result as data.csv.
# The pasted line-number prefixes were removed so the script is valid Python.
# Bare expressions such as `data.shape` are notebook cell outputs.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame,Series
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

data = pd.read_csv(r"D:/markdown/数学建模/附件3.3:2020校赛C题/2020校赛C题/附件/list1.csv", encoding="gbk")
data.shape

# Remove the trailing "Unnamed: N" columns.
# NOTE(review): presumably empty padding columns in the CSV -- confirm against the raw file.
for column in ("Unnamed: 20", "Unnamed: 19", "Unnamed: 18", "Unnamed: 17", "Unnamed: 16"):
    del data[column]
data

# Drop every row that contains a missing value.
data = data.dropna(axis=0, how='any')
data.shape

# Drop rows whose temperature difference is <= 0.
data = data.drop(data[data['dtemp'] <= 0].index)
data.shape

# Drop rows whose power is <= 0 (the original note says zero flow and zero
# power select the same rows).
data = data.drop(data[data['POWER'] <= 0].index)
data.shape

# Drop rows where both the room temperature and its set point are 0.
data = data.drop(data[(data['ROOM_TEMPERATURE'] == 0) & (data['ROOM_SET_TEMPERATURE'] == 0)].index)
data.shape

# Drop rows where the valve is fully closed ('全关') but the flow speed is non-zero.
data = data.drop(data[(data['FLOW_SPEED'] != 0) & (data['OPEN_STATUS'] == '全关')].index)
data.shape

# Room temperature: treat values outside mean +/- 3 standard deviations as outliers.
mean, std = data["ROOM_TEMPERATURE"].mean(), data["ROOM_TEMPERATURE"].std()
lower, upper = mean - 3 * std, mean + 3 * std
print("均值:", mean)
print("标准差:", std)
print("下限:", lower)
print("上限:", upper)
data["ROOM_TEMPERATURE"][(data["ROOM_TEMPERATURE"] < lower) | (data["ROOM_TEMPERATURE"] > upper)]

# Box plot of the room temperature.
sns.boxplot(data=data["ROOM_TEMPERATURE"])

# Remove the room-temperature outliers.
data = data.drop(data[(data["ROOM_TEMPERATURE"] < lower) | (data["ROOM_TEMPERATURE"] > upper)].index)

# Power: same 3-sigma rule, computed after the room-temperature outliers are gone.
mean, std = data["POWER"].mean(), data["POWER"].std()
lower, upper = mean - 3 * std, mean + 3 * std
print("均值:", mean)
print("标准差:", std)
print("下限:", lower)
print("上限:", upper)
data["POWER"][(data["POWER"] < lower) | (data["POWER"] > upper)]

sns.boxplot(data=data["POWER"])

data = data.drop(data[(data["POWER"] < lower) | (data["POWER"] > upper)].index)
data

# Sort by reading time.
data.sort_values("SYS_READ_TIME", inplace=True)
data

# Save. The index is written too; section 3 later removes the resulting
# "Unnamed: 0" column after merging.
data
data.to_csv('./data.csv', encoding="gbk")

二.附件二 填充整理

# Section 2: load attachment 2 (list2.csv), which holds the '时间' (time) and
# '温度' (temperature) columns used below.
# The pasted line-number prefixes were removed so the script is valid Python.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data2 = pd.read_csv(r"D:/markdown/数学建模/附件3.3:2020校赛C题/2020校赛C题/附件/list2.csv", encoding="gbk")
data2.shape

# Remove the empty column that sits between the two data columns.
del data2['Unnamed: 1']
11.	  
12.	  
def time_trans(time_ch):
    """Convert a 'YYYY/M/D H:M' timestamp string to the integer YYYYMMDDHHMM.

    Month, day, hour and minute are zero-padded to two digits, so
    '2017/12/1 5:30' becomes 201712010530. Anything after the minutes
    (for example a ':SS' seconds field) is ignored.

    Args:
        time_ch: Timestamp text with '/'-separated date fields, whitespace,
            then ':'-separated time fields.

    Returns:
        The timestamp as an int with the digit layout YYYYMMDDHHMM.

    Raises:
        ValueError: If the text lacks a date or time part, or a field is
            not an integer.
    """
    # Split once per separator instead of re-splitting the whole string for
    # every field.
    date_part, clock_part = time_ch.split()[:2]
    year, month, day = date_part.split('/')[:3]
    hour, minute = clock_part.split(':')[:2]
    # The minute is padded as well: an unpadded single-digit minute would
    # produce an 11-digit value that no longer sorts or decodes correctly.
    return int('%s%02d%02d%02d%02d' % (year, int(month), int(day), int(hour), int(minute)))
30.	  
# Quick check of the conversion (a notebook displays 201712010530).
time_trans('2017/12/1 5:30')

# Convert every timestamp of attachment 2 to its integer form.
new_time = [time_trans(stamp) for stamp in data2['时间']]
len(set(new_time))

# Temperatures as a plain list.
new_temp = list(data2['温度'])
new_temp

# Combine both lists into a new DataFrame.
from pandas import DataFrame
new_time_temp = {'new_time': new_time,
                 'new_temp': new_temp}
time_temp = DataFrame(new_time_temp)
time_temp

# Turn the half-hourly series into a per-minute one: for every original
# timestamp add the 29 following minutes (t+1 .. t+29) with no temperature.
# All rows are built at once and concatenated. The previous row-by-row
# DataFrame.append splice copied the frame on every insert, and append no
# longer exists in pandas 2.x.
# NOTE(review): adding 1 to the integer only yields valid times if the source
# timestamps fall on :00 / :30 -- confirm against the data.
minute_rows = pd.DataFrame(
    {'new_time': [start + offset for start in new_time for offset in range(1, 30)]}
)
time_temp = pd.concat([time_temp, minute_rows], ignore_index=True, sort=False)
time_temp

# Sort by the new time column.
time_temp.sort_values(by="new_time", ascending=True, inplace=True)
time_temp

# Discard the pre-sort index.
time_temp = time_temp.reset_index(drop=True)

# Fill each missing temperature with the closest valid value above it.
time_temp = time_temp.ffill()
time_temp
82.	  
83.	  
def time_trans1(time_ch):
    """Convert an integer timestamp YYYYMMDDHHMM back to 'YYYY/MM/DD HH:MM'.

    Args:
        time_ch: The timestamp as an int, an integral float, or a digit
            string laid out as YYYYMMDDHHMM.

    Returns:
        The timestamp text, e.g. '2017/12/01 05:30'. Fields keep their
        zero padding.

    Raises:
        ValueError: If time_ch cannot be converted to an integer.
    """
    # Go through int() first so a float such as 201712010530.0 does not leak
    # a trailing '.0' into the minutes field.
    digits = str(int(time_ch))
    return '%s/%s/%s %s:%s' % (
        digits[:4], digits[4:6], digits[6:8], digits[8:10], digits[10:]
    )
100.	  
# Convert the per-minute integer times back to text.
new_time1 = [time_trans1(stamp) for stamp in time_temp['new_time']]
len(new_time1)

# The filled temperatures as a plain list.
new_temp1 = list(time_temp['new_temp'])
new_temp1

# Rebuild a DataFrame with the original column names.
data = {'时间': new_time1,
        '温度': new_temp1}
data2 = pd.DataFrame(data)
data2

# Save. The index is written too; section 3 later removes the resulting
# "Unnamed: 0" column after merging.
data2.to_csv('./data2.csv', encoding="gbk")

三.数据集 合并整理

# Section 3: merge the cleaned attachment 1 data with the per-minute
# temperatures of attachment 2 and save the result as data3.csv.
# The pasted line-number prefixes were removed so the script is valid Python.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv(r"D:/markdown/数学建模/附件3.3:2020校赛C题/2020校赛C题/data.csv", encoding="gbk")
data2 = pd.read_csv(r"D:/markdown/数学建模/附件3.3:2020校赛C题/2020校赛C题/data2.csv", encoding="gbk")
data.shape
data2.shape

# Rename the time column of data2 so it matches data. (This line used to be
# bare text without a leading '#', which is a syntax error.)
data2.rename(columns={'时间': 'SYS_READ_TIME'}, inplace=True)
data2

# Left-join both data sets on the time column.
data1 = pd.merge(data, data2, how='left', on='SYS_READ_TIME')
data1.shape

# Drop rows with missing values, i.e. rows without an outdoor temperature.
data1 = data1.dropna(axis=0, how='any')
data1.shape

# Remove the index columns that both CSV files carried along; the merge
# suffixed them with _x / _y.
del data1['Unnamed: 0_y']
del data1['Unnamed: 0_x']

# Save.
data1.to_csv('./data3.csv', encoding="gbk")

四.线性回归

1.	# 导包,导入数据  
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame, Series
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
# Section 4: correlation analysis and a linear regression over all households.
data = pd.read_csv(r"D:/markdown/数学建模/附件3.3:2020校赛C题/2020校赛C题/data3.csv", encoding="gbk")
data.shape

# Covariance and correlation between power and room temperature.
X, y = data['POWER'], data['ROOM_TEMPERATURE']
print("协方差:", X.cov(y))
print("相关系数:", X.corr(y))

# Covariance and correlation between outdoor temperature and room temperature.
X, y = data['温度'], data['ROOM_TEMPERATURE']
print("协方差:", X.cov(y))
print("相关系数:", X.corr(y))

# Pairwise correlations of all numeric columns. The text columns are
# excluded explicitly because DataFrame.corr() raises on them in pandas 2.x
# (older versions dropped them silently).
numeric_data = data.select_dtypes(include=[np.number])
numeric_data.corr()

# Correlation heat map.
plt.figure(figsize=(15, 10))
ax = sns.heatmap(numeric_data.corr(), cmap=plt.cm.RdYlGn, annot=True, fmt=".2f")
# Note: Matplotlib 3.1.1 has a bug that cuts off the first and last heat-map
# rows. Adjusting the y-axis range by hand works around it (other versions
# do not need this):
# a, b = ax.get_ylim()
# ax.set_ylim(a + 0.5, b - 0.5)

# Hold out 30% as a test set, fit a linear regression, then report the
# feature weights, the intercept and the evaluation metrics.
X, y = data[['POWER', '温度', 'IN_TEMPERATURE']], data['ROOM_TEMPERATURE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
lr = LinearRegression()
lr.fit(X_train, y_train)

print("模型权重:", lr.coef_)
print("截距:", lr.intercept_)
y_hat = lr.predict(X_test)

# Compute the MSE once and reuse it for the RMSE.
mse = mean_squared_error(y_test, y_hat)
print("均方误差(MSE):", mse)
print("根均方误差(RMSE):", np.sqrt(mse))
print("平均绝对值误差(MAE):", mean_absolute_error(y_test, y_hat))
print("训练集R^2:", lr.score(X_train, y_train))
print("测试集R^2:", lr.score(X_test, y_test))
54.	  
55.	  
def func_all(x):
    """Fit and print a linear regression of room temperature for one household.

    Selects the rows of the module-level ``data`` whose ``address_2nd``
    equals ``x``, holds out 30% of them as a test set (``random_state=0``),
    fits ``ROOM_TEMPERATURE`` on ``POWER``, ``IN_TEMPERATURE`` and ``温度``,
    then prints the weights, the intercept and the evaluation metrics.

    Args:
        x: Household identifier as stored in the ``address_2nd`` column.
    """
    household = data[data.address_2nd == x]
    features = household[['POWER', 'IN_TEMPERATURE', '温度']]
    target = household['ROOM_TEMPERATURE']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=0
    )
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    print("住户:", x)
    print("模型权重:", lr.coef_)
    print("截距:", lr.intercept_)
    y_hat = lr.predict(X_test)

    # Compute the MSE once and reuse it for the RMSE.
    mse = mean_squared_error(y_test, y_hat)
    print("均方误差(MSE):", mse)
    print("根均方误差(RMSE):", np.sqrt(mse))
    print("平均绝对值误差(MAE):", mean_absolute_error(y_test, y_hat))
    # lr.score() also returns R^2, but it takes (X, y) while r2_score takes
    # (y_true, y_pred).
    print("训练集R^2:", r2_score(y_train, lr.predict(X_train)))
    print("测试集R^2:", r2_score(y_test, y_hat))
    print(' ')
# Household identifiers to evaluate one by one.
address = [15311251, 15310819, 15310804, 15311234, 15311289, 15311072, 15311061, 15310846, 15311065, 15311342, 15311245, 15310966,
           15311191, 15311361, 15310827, 15311196, 15311235, 15311233, 15310808, 15311473, 15310839, 15310815, 15310845,
           15311082]
# Iterate the list directly; the former manual index bump had no effect.
for x in address:
    func_all(x)
85.	
86.	
# Write the per-household regression results to result.csv.

import csv

address = [15311251, 15310819, 15310804, 15311234, 15311289, 15311072, 15311061, 15310846, 15311065, 15311342, 15311245, 15310966,
           15311191, 15311361, 15310827, 15311196, 15311235, 15311233, 15310808, 15311473, 15310839, 15310815, 15310845,
           15311082]


def _household_result_row(index, x):
    """Fit the regression for household ``x`` and return its CSV row.

    Replaces the former helper named ``all``, which shadowed the builtin and
    read the row index from a global loop variable.

    Args:
        index: Position of the household in ``address``; first CSV column.
        x: Household identifier as stored in the ``address_2nd`` column of
            the module-level ``data``.

    Returns:
        A list in header order: index, household, weights, intercept,
        regression equation, train R^2, test R^2, MSE, RMSE, MAE.
    """
    household = data[data.address_2nd == x]
    features = household[['POWER', 'IN_TEMPERATURE', '温度']]
    target = household['ROOM_TEMPERATURE']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=0
    )
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_hat = lr.predict(X_test)

    weights = lr.coef_
    intercept = lr.intercept_
    # x1, x2, x3 follow the feature order POWER, IN_TEMPERATURE, 温度.
    equation = 'y = %f x1 + %f x2 +%f x3 +%f' % (weights[0], weights[1], weights[2], intercept)
    mse = mean_squared_error(y_test, y_hat)
    return [
        index,
        x,
        weights,
        intercept,
        equation,
        r2_score(y_train, lr.predict(X_train)),
        r2_score(y_test, y_hat),
        mse,
        np.sqrt(mse),
        mean_absolute_error(y_test, y_hat),
    ]


# newline='' stops the csv module from emitting blank lines between rows on
# Windows; the encoding matches the other CSV files written by these scripts.
with open("result.csv", "w", newline="", encoding="gbk") as csvfile:
    writer = csv.writer(csvfile)

    # Header row first.
    writer.writerow(['index', '住户', '模型权重', '截距', '回归方程', '训练集R^2', '测试集R^2', '均方误差(MSE)', '根均方误差(RMSE)', '平均绝对值误差(MAE)'])

    for index, x in enumerate(address):
        writer.writerow(_household_result_row(index, x))

论文已提交,代码可以粘了黑黑黑
代码是粘的整理到word的附件,就出现了两排行号,难受,但懒得删了

你可能感兴趣的:(python#数据分析,python,数学建模,数据分析)