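MIC(Maximal Information Coefficient,最大信息系数)用于衡量两个变量之间线性或非线性关联的强度,取值范围为 [0, 1],可以参考 Reshef 等人 2011 年发表于 Science 的论文《Detecting Novel Associations in Large Data Sets》。
MIC 的计算相对比较复杂,数据量大的话消耗的时间也较长;可以用来计算 MIC 的库有 minepy。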
这里使用多进程来加速计算。首先安装 minepy:
pip install minepy
然后按下面的代码进行多进程运算即可:
import pandas as pd
import numpy as np
from minepy import MINE
from joblib import Parallel, delayed
def cal_mic(x, y):
    """Return the maximal information coefficient (MIC) of ``x`` and ``y``.

    A fresh ``MINE`` estimator is built with its default parameters on
    every call, scored on the two samples, and queried for its MIC value.

    Args:
        x: First sample, passed unchanged to ``MINE.compute_score``.
        y: Second sample, passed unchanged to ``MINE.compute_score``.

    Returns:
        Whatever ``MINE.mic()`` reports after scoring ``x`` against ``y``.
    """
    estimator = MINE()
    estimator.compute_score(x, y)
    mic_value = estimator.mic()
    return mic_value
def multi_mic(dataframe: pd.DataFrame, value: np.ndarray, cpu_num=5) -> list:
    """Compute the MIC of every column of ``dataframe`` against ``value``.

    One ``cal_mic`` job is created per column and the jobs are dispatched
    through joblib's ``Parallel`` using the ``'multiprocessing'`` backend.

    Args:
        dataframe: Table whose columns are each scored against ``value``.
        value: Target sample handed to ``cal_mic`` as its second argument
            for every column.
        cpu_num: Forwarded to ``Parallel`` as ``n_jobs``.

    Returns:
        The results produced by ``Parallel``, one MIC score per column of
        ``dataframe``.
    """
    # DataFrame.iteritems() was removed in pandas 2.0; items() yields the
    # same (label, Series) pairs and exists in older releases as well.
    # The column label is not needed, only the column data.
    jobs = [delayed(cal_mic)(column, value) for _, column in dataframe.items()]
    workers = Parallel(n_jobs=cpu_num, backend='multiprocessing')
    return workers(jobs)
if __name__ == '__main__':
    # Demo: 2000 rows x 10 columns of standard-normal noise.
    samples = np.random.randn(2000, 10)
    frame = pd.DataFrame(samples)
    # Column 0 doubles as the target, so it is also scored against itself.
    target = frame[0]
    # NOTE(review): the returned scores are discarded here, as in the original.
    multi_mic(frame, target)