Numpy/Pandas均值处理数据缺失值

# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------------------------------------------------
__Author__ = 'assasin'
__DateTime__ = '2020/1/5 15:13'
#-----------------------------------------------------------------------------------------------------------------------

'''
处理数据缺失值
Numpy均值处理数据缺失值
Pandas加载数据,重构缺失数据矩阵
Pandas 实现均值填充
Pandas 处理缺失值: 标量法,丢失法,忽略法,前后法
'''

import numpy as np
import pandas as pd
from numpy import *

def loadDataSet(filepath,delim='\t'):
    """Load a delimited text file of numbers into a NumPy matrix.

    Each non-blank line is stripped of surrounding whitespace, split on
    ``delim`` and every field is converted with ``float`` (so the text
    ``nan`` becomes a NaN entry).

    Args:
        filepath: Path of the text file to read.
        delim: Field separator handed to ``str.split``; defaults to a tab.

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataArr = []
    # The context manager closes the handle even when parsing raises.
    with open(filepath) as fr:
        for line in fr:
            stripped = line.strip()
            # A blank (e.g. trailing) line would otherwise reach float('').
            if not stripped:
                continue
            dataArr.append([float(field) for field in stripped.split(delim)])
    # np.asmatrix instead of the `mat` alias, which NumPy 2.0 removed.
    return np.asmatrix(dataArr)


def replaceNanwithMean(dataArr):
    """Replace NaN entries with the mean of the non-NaN values in their column.

    The input is modified in place and also returned. Both ``numpy.matrix``
    and plain 2-D ``numpy.ndarray`` inputs are accepted.

    Args:
        dataArr: 2-D numeric matrix/array that may contain NaN.

    Returns:
        The same ``dataArr`` object, with the processed columns filled.
    """
    numfeat = np.shape(dataArr)
    # NOTE(review): the last column is deliberately left untouched, exactly as
    # in the original loop bound; presumably it holds labels -- confirm.
    for i in range(numfeat[1] - 1):
        # Flatten to 1-D so matrix (n, 1) and ndarray (n,) columns look alike.
        column = np.asarray(dataArr[:, i]).ravel()
        missing = np.isnan(column)
        # Nothing to fill, or no observed value to average: leave the column.
        if not missing.any() or missing.all():
            continue
        meanVal = column[~missing].mean()
        dataArr[np.nonzero(missing)[0], i] = meanVal

    return dataArr





if __name__ == '__main__':
    # NumPy route: load the data set and mean-fill its missing values.
    dataArr = loadDataSet(r'../xxx.txt','    ')
    replaceNanwithMean(dataArr)

    # pandas route: reload the raw data so this demo starts from unfilled values.
    datamat = loadDataSet(r'../xxx.txt','    ')
    df = pd.DataFrame(datamat)
    # Rebuild the frame with 5 extra row labels; reindex fills the new rows
    # with NaN in every column.
    df = df.reindex(range(datamat.shape[0] + 5 ))
    # DataFrame.mean skips NaN by default (it does not count them as 0), and
    # fillna with the resulting Series fills each column with its own mean.
    filled = df.fillna(df.mean())
    # np.asmatrix instead of the `mat` alias, which NumPy 2.0 removed.
    print(np.asmatrix(filled.values))
    

 

你可能感兴趣的:(python,处理缺失值,均值填充)