Python 的 scipy
库提供了拉格朗日插值法的函数,可以直接调用。
案例
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.interpolate import lagrange
# Load the sales table and drop the date column so that only the columns to be
# interpolated remain.
cat_sale = pd.read_excel('data/catering_sale.xls')
cat_sale.drop('日期', axis=1, inplace=True)
# Treat sales below 400 or above 5000 as outliers and replace them with NaN.
# Series.mask writes the result back in one assignment; the chained form
# `df[col][mask] = value` may modify a temporary copy and leave the frame
# untouched. (`np.NAN` was also dropped as an alias in NumPy 2.0.)
cat_sale['销量'] = cat_sale['销量'].mask(
    (cat_sale['销量'] < 400) | (cat_sale['销量'] > 5000)
)
# Column-wise interpolation helper (Lagrange version).
def ployinterp_columns(s, n, k=4):
    """Estimate the value of ``s`` at label ``n`` by Lagrange interpolation.

    Up to ``k`` entries before ``n`` and up to ``k`` entries after it are used
    as interpolation nodes; entries that are themselves null are discarded.

    Args:
        s: Series holding the column to repair. The window is built from
            integer labels, so this assumes a default 0..len(s)-1 index.
        n: Label of the missing entry to estimate.
        k: Maximum number of neighbours taken on each side of ``n``.

    Returns:
        The interpolating polynomial evaluated at ``n``.
    """
    # Clamp the window to the series bounds. This covers the "near the start"
    # and "near the end" cases in one expression and also keeps the window
    # valid when the series is shorter than 2 * k + 1 entries.
    first = max(0, n - k)
    last = min(len(s), n + k + 1)
    y = s[list(range(first, n)) + list(range(n + 1, last))]
    y = y[y.notnull()]  # null neighbours cannot serve as nodes
    return lagrange(y.index, list(y))(n)
# Visit every cell and replace each missing one with its interpolated value.
for i in cat_sale.columns:
    # Filling cell j never changes whether a later cell is null, so the null
    # mask is computed once per column instead of once per row.
    missing = cat_sale[i].isnull()
    for j in range(len(cat_sale)):
        if missing[j]:
            # Interpolate once and reuse the value for both the log line and
            # the write-back.
            value = ployinterp_columns(cat_sale[i], j)
            print(value)
            # Single-step .loc assignment: the chained `cat_sale[i][j] = ...`
            # form may write to a temporary copy instead of the frame.
            cat_sale.loc[j, i] = value
cat_sale.to_csv('sales.csv')  # persist the repaired table
import numpy as np
import pandas as pd
# Reload the sales table for the Newton-interpolation run and drop the date
# column so that only the columns to be interpolated remain.
cat_sale = pd.read_excel('data/catering_sale.xls')
cat_sale.drop('日期', axis=1, inplace=True)
# Treat sales below 400 or above 5000 as outliers and replace them with NaN.
# Series.mask writes the result back in one assignment; the chained form
# `df[col][mask] = value` may modify a temporary copy and leave the frame
# untouched. (`np.NAN` was also dropped as an alias in NumPy 2.0.)
cat_sale['销量'] = cat_sale['销量'].mask(
    (cat_sale['销量'] < 400) | (cat_sale['销量'] > 5000)
)
# Helpers for Newton interpolation: the divided-difference table and the
# evaluation of the Newton form.
def cal_f(x, y):
    """Build the divided-difference table for the nodes ``(x[i], y[i])``.

    Column 0 holds ``y``; entry ``[i, k]`` holds the k-th order divided
    difference over nodes ``x[i - k] .. x[i]``. Entries above the diagonal
    stay zero. The table is printed before it is returned.

    Args:
        x: Node abscissae, indexable by integer position.
        y: Node ordinates, same length as ``x``.

    Returns:
        A ``(len(x), len(y))`` float array; its diagonal contains the
        coefficients of the Newton interpolating polynomial.
    """
    rows = len(x)
    f0 = np.zeros((rows, len(y)))  # storage for the divided differences
    for k in range(len(y)):  # order of the difference (column)
        for i in range(k, rows):  # last node of the span (row)
            if k == 0:
                f0[i, k] = y[i]
            else:
                # A k-th order difference spans x[i - k] .. x[i], so the
                # denominator is the width of that whole span, not the gap
                # between adjacent nodes.
                f0[i, k] = (f0[i, k - 1] - f0[i - 1, k - 1]) / (x[i] - x[i - k])
    print('差商表', '\n', f0)
    return f0
def newton(x, y, x_j):
    """Evaluate the Newton interpolating polynomial through ``(x, y)`` at ``x_j``.

    The coefficients are the diagonal of the table returned by ``cal_f``; the
    i-th one is multiplied by ``(x_j - x[0]) * ... * (x_j - x[i - 1])`` and the
    terms are summed.
    """
    coefficients = cal_f(x, y).diagonal()
    total = 0
    basis = 1  # product of (x_j - x[m]) for m < order; empty product is 1
    for order, coefficient in enumerate(coefficients):
        if order:
            # Extend the product by the one factor the previous term lacked.
            basis = basis * (x_j - x[order - 1])
        total = total + coefficient * basis
    return total
# Column-wise interpolation helper (Newton version).
def ployinterp_columns(s, n, x_j, k=3):
    """Estimate the value of ``s`` at ``x_j`` by Newton interpolation.

    Up to ``k`` entries before label ``n`` and up to ``k`` entries after it
    are used as interpolation nodes; entries that are themselves null are
    discarded.

    Args:
        s: Series holding the column to repair. The window is built from
            integer labels, so this assumes a default 0..len(s)-1 index.
        n: Label of the missing entry.
        x_j: Abscissa at which the polynomial is evaluated.
        k: Maximum number of neighbours taken on each side of ``n``.

    Returns:
        The value returned by ``newton`` for the selected nodes at ``x_j``.
    """
    # Clamp the window to the series bounds. This covers the "near the start"
    # and "near the end" cases in one expression and also keeps the window
    # valid when the series is shorter than 2 * k + 1 entries.
    first = max(0, n - k)
    last = min(len(s), n + k + 1)
    y = s[list(range(first, n)) + list(range(n + 1, last))]
    y = y[y.notnull()]  # null neighbours cannot serve as nodes
    return newton(y.index, list(y), x_j)
# Visit every cell and replace each missing one with its Newton-interpolated
# value.
for i in cat_sale.columns:
    # Filling cell j never changes whether a later cell is null, so the null
    # mask is computed once per column instead of once per row.
    missing = cat_sale[i].isnull()
    for j in range(len(cat_sale)):
        if missing[j]:
            x_j = cat_sale.index[j]
            # Interpolate once and reuse the value; calling the helper twice
            # also printed the divided-difference table twice per cell.
            value = ployinterp_columns(cat_sale[i], j, x_j)
            print(value)
            # Single-step .loc assignment: the chained `cat_sale[i][j] = ...`
            # form may write to a temporary copy instead of the frame.
            cat_sale.loc[j, i] = value
# NOTE(review): writing the legacy .xls format depends on an Excel writer
# engine being available in the installed pandas -- confirm before relying on it.
cat_sale.to_excel('saless.xls')
k 的选取非常非常重要!!!以拉格朗日插值为例,
k=1,填充的值为
2618.2
3902.2000000000007
2868.0499999999993
2844.5000000000146
2731.399999999994
2471.949999999986
k=2,填充的值为
2627.9999999999995
4077.716666666587
3291.2166666662088
3080.916666775942
2846.4166668355465
2474.0000002086163
k=3,填充的值为
2681.299999999999
4162.340000009164
3658.435000004247
3221.830780029297
2919.119171142578
2501.641655921936
k=4,填充的值为
1987.8999999999987
4224.922857132275
3940.338572591543
3313.03125
2977.59375
2498.1875
k=5,填充的值为
-291.4000000000001
4275.254762476077
4156.860423326492
96.0
6720.0
-75744.0
k=6,填充的值为
-4788.299999999997
4315.101515245042
4325.017848968506
10567680.0
-24801280.0
28934144.0
稍不注意,填充的值就为异常值!!!
如果对您有帮助,麻烦点赞关注,这真的对我很重要!!!如果需要互关,请评论留言!