拉格朗日插值法的直观表述可以参考:https://www.zhihu.com/question/58333118
在python中可以直接使用:
from scipy.interpolate import lagrange
import numpy as np
x = np.array([1, 2, 3, 4, 5])
# Six observed values, but only five x coordinates. The last value (10) has no
# matching x -- presumably it is the value observed at x=6, kept only for
# comparison with the extrapolation printed below (TODO confirm).
observed = np.array([10, 8, 4, 6, 2, 10])
# Fit on exactly as many values as there are x coordinates instead of relying
# on lagrange() silently ignoring the extra one.
y = observed[:len(x)]
lag_model = lagrange(x, y)  # interpolating polynomial through the (x, y) pairs
print(lag_model)
print("when x=4, y=", lag_model(4))  # x=4 is one of the fitted nodes
print("when x=6, y=", lag_model(6))  # x=6 lies outside the fitted nodes (extrapolation)
打印的结果是:
         4         3         2
-0.8333 x + 9.667 x - 38.17 x + 57.33 x - 18
when x=4, y= 5.999999999999915
when x=6, y= -40.00000000000037
打印 lag_model 时,给出拟合的函数:$-0.8333x^4 + 9.667x^3 - 38.17x^2 + 57.33x - 18$
但是根据模型拟合结果来看:x=4 是已知的插值节点,预测值约为 6,与样本一致;而 x=6 超出了已知节点的范围(属于外推),预测值约为 -40,远远偏离样本中 y 的取值范围(2~10)。
所以拉格朗日插值适合存在一定函数依赖的序列,常见于时间序列,如果是一般的采样数据预测偏差会较大
拉格朗日插值的原理是用一个多项式函数去拟合给定的数据点。但有时数据点的 x 只是随机的编号,真正有价值的只有 y,因此这里给出两种代码:第一种以序列的索引作为 x,第二种以邻近点在窗口内的顺序位置作为 x:
import pandas as pd
import numpy as np
def lagrange_fill(series, k=5):
    """Fill the missing values of ``series`` by local Lagrange interpolation.

    Each missing entry is replaced by the value, at its own index label, of
    the Lagrange polynomial fitted through the non-null entries among the
    ``k`` positions before it and the ``k`` positions after it. The index
    labels of those neighbours are the x coordinates of the fit, so the index
    must be numeric.

    Gaps are filled one after another and in place, so a value filled earlier
    can serve as a neighbour for a later gap.

    :param series: pandas Series to fill; it is modified in place
    :param k: number of positions taken on each side of a gap
    :return: the same ``series`` object with its missing values filled
    """
    from scipy.interpolate import lagrange

    size = series.shape[0]
    # Offsets of the neighbours relative to the gap: -k..-1 and 1..k.
    offsets = [step for step in range(-k, k + 1) if step != 0]

    def interpolate_at(pos):
        """Return the interpolated value for the gap at position ``pos``."""
        # Positions are taken modulo the length, so a window that runs past
        # either end of the series wraps around to the other end.
        window = [(pos + step) % size for step in offsets]
        neighbours = series.iloc[window]
        neighbours = neighbours[neighbours.notnull()]  # drop neighbours that are missing too
        polynomial = lagrange(neighbours.index.tolist(), neighbours.values)
        # Evaluate at the label (not the position) of the gap, to match the
        # label-based x coordinates used for the fit.
        return polynomial(series.index[pos])

    # Work with positions rather than labels so that .iloc stays valid for
    # any index, not only the default 0..n-1 one. The list is built once,
    # before any value is written.
    missing_positions = [pos for pos, is_null in enumerate(series.isnull()) if is_null]
    for pos in missing_positions:
        series.iloc[pos] = interpolate_at(pos)
    return series
def get_dataset():
    """Build a demo dataset in which part of one feature column is missing.

    A 100-sample, 6-feature, 4-class classification set is generated, then a
    random 20% of the rows have their ``miss_line`` value replaced by NaN.

    :return data_x: feature DataFrame whose ``miss_line`` column contains NaN
    :return true_value: original ``miss_line`` values of the blanked rows
    :return data_y: class labels generated together with the features
    """
    import copy
    from sklearn.datasets import make_classification

    feature_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'miss_line']  # 6 features
    features, data_y = make_classification(
        n_samples=100, n_classes=4, n_features=6, n_informative=4, random_state=0)
    data_x = pd.DataFrame(features)
    data_x.columns = feature_names

    # Snapshot the complete table before any value is removed.
    complete = copy.deepcopy(data_x)

    # Blank out 20% of the rows in the miss_line column. No seed is passed to
    # sample(), so the chosen rows are not fixed by random_state=0 above.
    missing_rows = data_x.sample(frac=0.2).index
    data_x.loc[missing_rows, "miss_line"] = np.nan

    # Ground truth for the entries that were just blanked.
    true_value = complete.loc[missing_rows, 'miss_line']
    return data_x, true_value, data_y
if __name__ == '__main__':
    # Build the demo data, then overwrite the incomplete column with its
    # interpolated version.
    dataset = get_dataset()
    value_x, true_value_x, value_y = dataset
    target_column = 'miss_line'
    fill_value = lagrange_fill(value_x[target_column])
    value_x[target_column] = fill_value
import pandas as pd
import numpy as np
def lagrange_fill(series, k=5):
    """Fill the missing values of ``series`` by local Lagrange interpolation.

    This variant ignores the index labels and uses only the ordering of the
    values. The window around a gap is numbered with slots ``0..2k``: the
    ``k`` preceding positions take slots ``0..k-1``, the gap itself is slot
    ``k``, and the ``k`` following positions take slots ``k+1..2k``. A
    Lagrange polynomial is fitted through the non-null neighbours at their
    slots and evaluated at slot ``k``.

    Gaps are filled one after another and in place, so a value filled earlier
    can serve as a neighbour for a later gap.

    :param series: pandas Series to fill; it is modified in place
    :param k: number of positions taken on each side of a gap
    :return: the same ``series`` object with its missing values filled
    """
    from scipy.interpolate import lagrange

    size = series.shape[0]
    # Offsets of the neighbours relative to the gap: -k..-1 and 1..k.
    offsets = [step for step in range(-k, k + 1) if step != 0]

    def interpolate_at(pos):
        """Return the interpolated value for the gap at position ``pos``."""
        # Positions are taken modulo the length, so a window that runs past
        # either end of the series wraps around to the other end.
        window = [(pos + step) % size for step in offsets]
        neighbours = series.iloc[window]
        present = neighbours.notnull().tolist()
        # Keep each neighbour on its own slot (offset + k). Slot k is left
        # free for the gap, so evaluating there is a real interpolation
        # rather than a read-back of one of the fitted nodes.
        slots = [step + k for step, keep in zip(offsets, present) if keep]
        values = neighbours[neighbours.notnull()].tolist()
        return lagrange(slots, values)(k)

    # Work with positions rather than labels so that .iloc stays valid for
    # any index, not only the default 0..n-1 one. The list is built once,
    # before any value is written.
    missing_positions = [pos for pos, is_null in enumerate(series.isnull()) if is_null]
    for pos in missing_positions:
        series.iloc[pos] = interpolate_at(pos)
    return series
def get_dataset():
    """Create a synthetic dataset with one partially missing feature column.

    The features and labels are a 100-sample, 6-feature, 4-class
    classification set; 20% of the rows, picked at random, then lose their
    ``miss_line`` value.

    :return data_x: feature DataFrame with NaN in part of ``miss_line``
    :return true_value: the removed ``miss_line`` values, as originally generated
    :return data_y: class labels generated together with the features
    """
    import copy
    from sklearn.datasets import make_classification

    raw_features, data_y = make_classification(
        n_samples=100,
        n_classes=4,
        n_features=6,  # 6 features
        n_informative=4,
        random_state=0,
    )
    data_x = pd.DataFrame(raw_features)
    data_x.columns = ['x1', 'x2', 'x3', 'x4', 'x5', 'miss_line']

    # Keep an untouched copy so the removed values can be returned as truth.
    untouched = copy.deepcopy(data_x)

    # Remove 20% of the miss_line entries; the row choice is not seeded here.
    removed_rows = data_x.sample(frac=0.2).index
    data_x.loc[removed_rows, "miss_line"] = np.nan

    true_value = untouched.loc[removed_rows, 'miss_line']
    return data_x, true_value, data_y
if __name__ == '__main__':
    # Generate the demo data and replace the incomplete column by the result
    # of the Lagrange fill.
    dataset = get_dataset()
    value_x, true_value_x, value_y = dataset
    target_column = 'miss_line'
    fill_value = lagrange_fill(value_x[target_column])
    value_x[target_column] = fill_value