Step1: 数据处理
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
## Data initialisation
# The three sheets live in the same workbook, so the file is parsed once and
# the sheets are taken from the returned {sheet name: DataFrame} mapping
# instead of re-opening the workbook for every sheet.
_sheets = pd.read_excel(
    r'C:\Users\Dave Young\Desktop\C\附件1.xlsx',
    sheet_name=['企业信息', '进项发票信息', '销项发票信息'],
)
data_1 = _sheets['企业信息']      # enterprise information
data_2 = _sheets['进项发票信息']  # input (purchase) invoices
data_3 = _sheets['销项发票信息']  # output (sales) invoices

# Enterprise rating encoded as a number: A -> 4, B -> 3, C -> 2, D -> 1.
# The rating letter is read from the third column of the enterprise sheet.
# Mapping the whole column keeps exactly one entry per enterprise row: a
# letter outside A-D becomes NaN instead of being dropped, so the positions
# in rate.xlsx always line up with the enterprise order.
_rating_scores = {'A': 4, 'B': 3, 'C': 2, 'D': 1}
rate = data_1.iloc[:, 2].map(_rating_scores).tolist()
rate = pd.DataFrame(rate)
rate.to_excel('rate.xlsx')
# Split both invoice tables into valid invoices and voided invoices.
data_2valid = data_2.loc[data_2['发票状态'] == '有效发票', :]
data_2invalid = data_2.loc[data_2['发票状态'] == '作废发票', :]
data_3valid = data_3.loc[data_3['发票状态'] == '有效发票', :]
data_3invalid = data_3.loc[data_3['发票状态'] == '作废发票', :]

# Enterprise codes E1 ... E123, in the order used for every output table.
_enterprise_codes = ['E' + str(n) for n in range(1, 124)]


def _total_by_enterprise(invoices):
    """Return the per-enterprise sum of 价税合计 as a (123, 1) float array.

    Rows follow E1 ... E123. An enterprise with no invoice in *invoices*
    gets 0, which replaces the hand-written patch of index 107 that the
    builtin ``sum`` over an empty selection used to require.
    """
    totals = invoices.groupby('企业代号')['价税合计'].sum()
    totals = totals.reindex(_enterprise_codes, fill_value=0)
    return totals.to_numpy(dtype=float).reshape(-1, 1)


# Totals of the valid invoices for every enterprise.
data2_total = pd.DataFrame(_total_by_enterprise(data_2valid))  # input invoices
data3_total = pd.DataFrame(_total_by_enterprise(data_3valid))  # output invoices
data2_total.to_excel('data1.xlsx')  # per-enterprise input totals
data3_total.to_excel('data2.xlsx')  # per-enterprise output totals

# Plain ndarrays are used here: np.mat no longer exists in NumPy 2.0, and the
# element-wise arithmetic below gives the same values on (123, 1) arrays.
data2_total_mat = data2_total.to_numpy()
data3_total_mat = data3_total.to_numpy()

# (output - input) / output. An enterprise whose output total is 0 yields
# inf/nan here, exactly as the division did before.
data4_valid = (data3_total_mat - data2_total_mat) / data3_total_mat
data4_valid = pd.DataFrame(data4_valid)
# input - output (labelled "working-capital data" in the original comment).
data5_valid = data2_total_mat - data3_total_mat
data5_valid = pd.DataFrame(data5_valid)
data5_valid.to_excel('data4.xlsx')  # input total minus output total
data4_valid.to_excel('data3.xlsx')  # relative difference between output and input
# Voided-invoice totals per enterprise.
_waste_codes = ['E' + str(n) for n in range(1, 124)]


def _voided_total_by_enterprise(invoices):
    """Return the per-enterprise sum of 价税合计 as a (123, 1) float array.

    Rows follow E1 ... E123 and an enterprise without any row in *invoices*
    gets 0. This covers every enterprise on both the input and the output
    side; the previous zero-patching loop ran over ``range(1, 123)`` and so
    never looked at the first or the last enterprise, and the output side
    was only patched at index 107.
    """
    totals = invoices.groupby('企业代号')['价税合计'].sum()
    totals = totals.reindex(_waste_codes, fill_value=0)
    return totals.to_numpy(dtype=float).reshape(-1, 1)


data2_waste_total = pd.DataFrame(_voided_total_by_enterprise(data_2invalid))
data3_waste_total = pd.DataFrame(_voided_total_by_enterprise(data_3invalid))

# ndarrays instead of np.mat (removed in NumPy 2.0); the sum is element-wise
# either way.
data2_waste_total_mat = data2_waste_total.to_numpy()
data3_waste_total_mat = data3_waste_total.to_numpy()

# Combined voided amount: input side plus output side.
data6_mat = data2_waste_total_mat + data3_waste_total_mat
data6 = pd.DataFrame(data6_mat)
data6.to_excel('waste.xlsx')
Step2: 基于熵权法的TOPSIS对各企业进行赋分
from sklearn.impute import SimpleImputer
# Load the indicator table used for the TOPSIS scoring.
data = pd.read_excel(r'C:\Users\Dave Young\Desktop\C\TOPSISdata.xlsx')
label_need = data.keys()[1:] # column labels, skipping the first column
data1 = data[label_need].values # indicator matrix as a NumPy array
[m,n] = data1.shape # number of rows and number of columns
# Standardisation: each column is divided by the square root of its sum of squares.
data2 = data1.astype('float')
for j in range(0,n):
data2[:,j] = data1[:,j]/np.sqrt(sum(np.square(data1[:,j])))
# Column index 5 is put back to its raw (un-normalised) values.
data2[:,[5]] = data1[:,[5]]
# Post-processing of negative entries, with the row and column counts
# hard-coded as 123 and 6 rather than taken from m and n.
# NOTE(review): indentation was lost in this copy, so it cannot be told
# whether the `else:` below pairs with the `data2[i,k] < 0` test or with the
# inner max-min test -- confirm against the original script. As written, the
# inner test is true whenever the column's max and min differ.
for i in range(123):
for k in range(6):
if data2[i,k] < 0:
if max((data1[:,[k]]))- min(data1[:,[k]]):
data2[i,k] = 0.01
else:
data2[i,k] = (data1[i,k]-min(data1[:,[k]]))/(max((data1[:,[k]]))- min(data1[:,[k]]))
imp = SimpleImputer(missing_values=np.nan,strategy='mean') # replace NaN entries with the column mean
imp.fit(data2)
data2 = imp.transform(data2)
# Entropy weight method.
# p holds the column-wise proportions of the standardised matrix. It is built
# as a NEW array: binding `p = data2` and `E = data2[0,:]` (as before) made
# both names views of data2, so the proportions overwrote the standardised
# data and the entropies overwrote its first row before R was computed.
p = data2 / data2.sum(axis=0)
# Information entropy of every indicator; 1e-5 keeps log() finite at p == 0.
E = -(p * np.log(p + 1e-5)).sum(axis=0) / np.log(m)
# Indicator weights derived from the entropies.
w = (1 - E) / np.sum(1 - E)

# Weighted matrix, computed from the untouched standardised data.
R = data2 * w

# Distance of every row to the per-indicator maximum and minimum of R.
r_max = R.max(axis=0)  # largest weighted value of each indicator
r_min = R.min(axis=0)  # smallest weighted value of each indicator
d_z = np.sqrt(np.square(R - r_max).sum(axis=1))  # d+ vector
d_f = np.sqrt(np.square(R - r_min).sum(axis=1))  # d- vector

# Closeness score, rescaled so that the best row scores 100.
s = d_f / (d_z + d_f)
Score = 100 * s / s.max()
score = pd.DataFrame(Score)
score.to_excel('score_problem.xlsx')
# Draw the score curve: x runs over 1 ... 123, y is the Score vector.
x = range(1, 124)
plt.plot(x, Score)
# Font and minus-sign settings for the Chinese labels set below.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',
    'axes.unicode_minus': False,
})
plt.title('得分曲线')
plt.xlabel('企业代号')
plt.ylabel('得分')
Step3: 基于线性回归和随机搜索法的利率确定
# Random search for the interest rate at which the interest income is largest.
# P is the raw value array of the loan-amount workbook, read without a header row.
P = pd.read_excel(r'C:\Users\Dave Young\Desktop\C\贷款额度.xlsx',header = None).values
# scope(k): random search for row k of P; returns the rate r held when the loop ends.
# NOTE(review): this function is incomplete in this copy. The `while s` and
# `if min_fx[i-1]` lines have lost their condition and colon, all indentation
# is gone, and `c_x`, `c_b` and `r0` are not defined anywhere in the visible
# file -- presumably text was stripped during extraction. TODO: restore from
# the original script before running.
def scope(k):
s = 0 # counter used by the while loop
N = 1000 # count at which the while loop is meant to stop
i = 1 # index counter
fx = [] # objective values evaluated so far
min_fx = [] # running minimum of fx
r = np.random.random()
while s
# Objective for the current rate r, using the second column of row k of P.
obj = P[k,1]*r*(1-c_x*r-c_b)
fx.append(obj[0,0])
min_fx.append(min(fx))
s = s+1
if i == 1:
r = r0
else:
if min_fx[i-1]
s = 0
else:
s = s+1
i = i+1
#fx_min = min_fx[-1]
return r
fxmin_store = []  # meant for the minimum objective values; nothing is stored in it here
# One scope() call for each k in 0 ... 122, kept in that order, then written
# out as the interest-rate table.
r_store = pd.DataFrame([scope(k) for k in range(123)])
r_store.to_excel('利率表.xlsx')
Step4: 逻辑回归与BPAdaboost网络确定信誉等级和是否放贷
# Logistic regression: rating (column 0 of `total`) from the three invoice
# features (columns 1-3). Every table contributes its second spreadsheet
# column, i.e. the data column next to the saved index.
rate = pd.read_excel(r'C:\Users\Dave Young\Desktop\C\rate.xlsx').iloc[:, [1]]
total_in = pd.read_excel(r'C:\Users\Dave Young\Desktop\C\data1.xlsx').iloc[:, [1]]
total_out = pd.read_excel(r'C:\Users\Dave Young\Desktop\C\data2.xlsx').iloc[:, [1]]
total_dis = pd.read_excel(r'C:\Users\Dave Young\Desktop\C\data4.xlsx').iloc[:, [1]]
total = np.hstack((rate, total_in, total_out, total_dis))

LR_test_code = [10, 20, 30, 50, 70, 90, 100, 110, 120, 122]  # row positions held out for testing
LR_train_code = [i for i in range(123) if i not in LR_test_code]  # remaining rows are used for training

# Rows are chosen with the index list and columns with a slice. Passing two
# index lists of different lengths (e.g. 113 rows with [1,2,3]) makes NumPy
# try to pair them element by element and raises an IndexError.
x = total[LR_train_code, 1:4]
y = total[LR_train_code, 0]
x1 = total[LR_test_code, 1:4]
y1 = total[LR_test_code, 0]

from sklearn.linear_model import LogisticRegression as LR
lr = LR()
lr.fit(x, y)
r = lr.score(x, y)  # accuracy on the training rows
R = lr.predict(x1)  # predicted ratings for the held-out rows
R = pd.DataFrame(R)
R = R.iloc[:, [0]]
# AdaBoost prediction (headed "BP-AdaBoost neural network" in the original).
# The classifier is created without an explicit base estimator.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def _second_column(path):
    """Read an Excel file and keep only its second column as a one-column frame."""
    return pd.read_excel(path).iloc[:, [1]]


rate = _second_column(r'C:\Users\Dave Young\Desktop\C\rate.xlsx')
total_in = _second_column(r'C:\Users\Dave Young\Desktop\C\data1.xlsx')
total_out = _second_column(r'C:\Users\Dave Young\Desktop\C\data2.xlsx')
total_dis = _second_column(r'C:\Users\Dave Young\Desktop\C\data4.xlsx')
total_waste = _second_column(r'C:\Users\Dave Young\Desktop\C\waste.xlsx')
# Column 0 is the rating, columns 1-4 are the features.
total = np.hstack((rate, total_in, total_out, total_dis, total_waste))

LR_test_code = [10, 20, 30, 50, 70, 90, 100, 110, 120, 122]  # row positions held out for testing
LR_train_code = [row for row in range(123) if row not in LR_test_code]  # all other rows

x, y = total[LR_train_code, 1:], total[LR_train_code, [0]]
x1, y1 = total[LR_test_code, 1:], total[LR_test_code, [0]]

AB = AdaBoostClassifier(n_estimators=1000)
AB.fit(x, y)
predict_results = AB.predict(x1)

# Report accuracy, confusion matrix and per-class metrics on the held-out rows.
print(accuracy_score(predict_results, y1))
conf_mat = confusion_matrix(y1, predict_results)
print(conf_mat)
print(classification_report(y1, predict_results))
# Prediction on the second data set: the second column of each of the four
# feature files is loaded in the same order as the training features.
total_in2, total_out2, total_dis2, total_waste2 = (
    pd.read_excel(path).iloc[:, [1]]
    for path in (
        r'C:\Users\Dave Young\Desktop\C\data11.xlsx',
        r'C:\Users\Dave Young\Desktop\C\data22.xlsx',
        r'C:\Users\Dave Young\Desktop\C\data33.xlsx',
        r'C:\Users\Dave Young\Desktop\C\waste_problem2.xlsx',
    )
)
x2 = np.hstack((total_in2, total_out2, total_dis2, total_waste2))
# Classify every row with the fitted AdaBoost model and save the labels.
predict_result_total = AB.predict(x2)
wanted = pd.DataFrame(predict_result_total)
wanted.to_excel('预测结果.xlsx')
Step5: 传染病模型确定受疫情影响的企业比例
# Epidemic-style simulation of the proportion of enterprises affected by the
# outbreak, iterated day by day over T days.
N = 24        # population size used to turn the initial count into a proportion
T = 70        # number of simulated days
s = np.zeros(T)  # proportion still susceptible, one entry per day
i = np.zeros(T)  # proportion affected, one entry per day
lamda = 0.8   # infection rate
gamma = 0.2   # recovery (treatment) rate
i[0] = 4.0 / N   # start with 4 affected enterprises
s[0] = 1 - i[0]  # everyone else starts as susceptible
for t in range(T - 1):
    current = i[t]
    # New infections are proportional to affected * susceptible; recoveries
    # are proportional to the affected share.
    i[t + 1] = current + current * (1 - current) * lamda - current * gamma
    s[t + 1] = 1 - i[t + 1]
# Plot both proportions against the day index: affected in red, susceptible in blue.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',
    'axes.unicode_minus': False,
})
fig, ax = plt.subplots(figsize=(8, 4))
for series, colour in ((i, 'r'), (s, 'b')):
    ax.plot(series, c=colour)
ax.set_xlabel('天数', fontsize=10)
ax.set_ylabel('受疫情比例', fontsize=10)
ax.grid(1)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)