数据清洗

import numpy as np


def get_repeated_label(array_1dim):
    label_1dim=[]
    for k in range(len(array_1dim)-1):
        for i in range(k+1,len(array_1dim),1):
            if array_1dim[k]==array_1dim[i]:
                label_1dim.append(i)
    return np.unique(label_1dim)



 def invalid_value_cleaning (array_variable):
    for i in range(np.shape(array_variable)[0]):
        for j in range(np.shape(array_variable)[1]):
            if type(array_variable[i,j])==float or type(array_variable[i,j])==int:
                pass
            else:
                array_variable[i,j]=0
    array_variable[np.where(np.isnan(array_variable.astype(np.float64)))]=0
    return array_variable


def fillup_mean_value(array_parameter):
    for index_j in range(np.shape(array_parameter)[1]):
        array_columns_nr=list(np.where(array_parameter==0)[1])
        if index_j in array_columns_nr:
            appearance_sum=array_columns_nr.count(index_j)
            if appearance_sum>0:
                mean_1=(array_parameter[:,index_j]).sum()/(np.shape(array_parameter)[0]-appearance_sum)
                bollmatrix_1=array_parameter[:,index_j]==0
                array_parameter[bollmatrix_1,index_j]=mean_1
   
                return array_parameter
            

def delete_rows(list_variable,array_variable):
    i=0
    list_variable.sort()
    array_medium=array_variable
    for j in list_variable:
        if i==0:
            array_medium=np.delete(array_medium,j,0)
        else:
            array_medium=np.delete(array_medium,j-i,0)
            i=i+1
        return array_medium
    

import xlrd

name=r'Desktop\salesdetails.xlsx'

information_1=xlrd.open_workbook(name)

information_2 = information_1.sheet_by_name(u'Sheet1')

dataload_1=[]

nrows = information_2.nrows

for j in range(1,nrows,1):
    dataload_1.append(list(information_2.row_values(j)))
    

Data_Deal_with=np.array(dataload_1,dtype=np.object)

Data_Deal_with
Out[21]: 
array([[20170323.0, 126.0, 229.0, 159.0, 134300.0],
       [20170324.0, 128.0, 175.0, 111.0, 221.0],
       [20170325.0, 117.0, 77.0, 40.0, 171.0],
       [20170326.0, 93.0, 30.0, 28.0, 106.0],
       [20170327.0, 73.0, 46.0, 30.0, 123.0],
       [20170328.0, '¥$#', 28.0, 16.0, 74.0],
       [20170329.0, 46.0, 13.0, 10.0, 50.0],
       [20170330.0, 32.0, 11.0, 19.0, 30.0],
       [20170331.0, 25.0, 70000.0, 11.0, 53.0],
       [20170401.0, 41.0, 10.0, 16.0, 27.0],
       [20170401.0, 41.0, 10.0, 16.0, 27.0],
       [20170402.0, 21.0, 166.0, 66.0, 24.0],
       [20170403.0, 60.0, 67.0, 275.0, 17.0],
       [20170404.0, 74.0, 46.0, 51.0, 18.0],
       [20170405.0, 76.0, 25.0, 42.0, 12.0],
       [20170406.0, 54.0, 32.0, '', 11.0],
       [20170407.0, 186.0, 18.0, 18.0, 97700.0],
       [20170408.0, 121.0, 'sales ist Null', 23.0, 440.0],
       [20170409.0, 83.0, 17.0, 16.0, 107.0],
       [20170410.0, 87.0, 12.0, 15.0, 117.0],
       [20170411.0, 35.0, 16.0, 22.0, 62.0],
       [20170411.0, 35.0, 16.0, 22.0, 62.0],
       [20170412.0, '', 23.0, 15.0, 128.0],
       [20170413.0, 25.0, 17.0, 17.0, ''],
       [20170414.0, 24.0, 27.0, '', 30.0],
       [20170415.0, 13.0, 36.0, 12.0, 65.0],
       [20170415.0, 13.0, 36.0, 12.0, 65.0],
       [20170416.0, 9.0, 45.0, 147.0, 22.0],
       [20170417.0, 14.0, 54.0, 22.0, 22.0],
       [20170418.0, 16.0, 21.0, 4.0, 15.0],
       [20170419.0, 8.0, 13.0, 4.0, 23.0],
       [20170420.0, '%$', 59.0, 32452.0, 11.0],
       [20170421.0, 6.0, 89.0, 4.0, 13.0],
       [20170422.0, 5.0, 76.0, 31.0, 11.0]], dtype=object)

Columns_1=Data_Deal_with[:,0]

Columns_1
Out[23]: 
array([20170323.0, 20170324.0, 20170325.0, 20170326.0, 20170327.0,
       20170328.0, 20170329.0, 20170330.0, 20170331.0, 20170401.0,
       20170401.0, 20170402.0, 20170403.0, 20170404.0, 20170405.0,
       20170406.0, 20170407.0, 20170408.0, 20170409.0, 20170410.0,
       20170411.0, 20170411.0, 20170412.0, 20170413.0, 20170414.0,
       20170415.0, 20170415.0, 20170416.0, 20170417.0, 20170418.0,
       20170419.0, 20170420.0, 20170421.0, 20170422.0], dtype=object)

get_repeated_label(Columns_1)
Out[24]: array([10, 21, 26])

delete_rows(get_repeated_label(Columns_1),Data_Deal_with)
Out[25]: 
array([[20170323.0, 126.0, 229.0, 159.0, 134300.0],
       [20170324.0, 128.0, 175.0, 111.0, 221.0],
       [20170325.0, 117.0, 77.0, 40.0, 171.0],
       [20170326.0, 93.0, 30.0, 28.0, 106.0],
       [20170327.0, 73.0, 46.0, 30.0, 123.0],
       [20170328.0, '¥$#', 28.0, 16.0, 74.0],
       [20170329.0, 46.0, 13.0, 10.0, 50.0],
       [20170330.0, 32.0, 11.0, 19.0, 30.0],
       [20170331.0, 25.0, 70000.0, 11.0, 53.0],
       [20170401.0, 41.0, 10.0, 16.0, 27.0],
       [20170402.0, 21.0, 166.0, 66.0, 24.0],
       [20170403.0, 60.0, 67.0, 275.0, 17.0],
       [20170404.0, 74.0, 46.0, 51.0, 18.0],
       [20170405.0, 76.0, 25.0, 42.0, 12.0],
       [20170406.0, 54.0, 32.0, '', 11.0],
       [20170407.0, 186.0, 18.0, 18.0, 97700.0],
       [20170408.0, 121.0, 'sales ist Null', 23.0, 440.0],
       [20170409.0, 83.0, 17.0, 16.0, 107.0],
       [20170410.0, 87.0, 12.0, 15.0, 117.0],
       [20170411.0, 35.0, 16.0, 22.0, 62.0],
       [20170411.0, 35.0, 16.0, 22.0, 62.0],
       [20170412.0, '', 23.0, 15.0, 128.0],
       [20170413.0, 25.0, 17.0, 17.0, ''],
       [20170414.0, 24.0, 27.0, '', 30.0],
       [20170415.0, 13.0, 36.0, 12.0, 65.0],
       [20170415.0, 13.0, 36.0, 12.0, 65.0],
       [20170416.0, 9.0, 45.0, 147.0, 22.0],
       [20170417.0, 14.0, 54.0, 22.0, 22.0],
       [20170418.0, 16.0, 21.0, 4.0, 15.0],
       [20170419.0, 8.0, 13.0, 4.0, 23.0],
       [20170420.0, '%$', 59.0, 32452.0, 11.0],
       [20170421.0, 6.0, 89.0, 4.0, 13.0],
       [20170422.0, 5.0, 76.0, 31.0, 11.0]], dtype=object)

invalid_value_cleaning(Data_Deal_with)
Out[27]: 
array([[20170323.0, 126.0, 229.0, 159.0, 134300.0],
       [20170324.0, 128.0, 175.0, 111.0, 221.0],
       [20170325.0, 117.0, 77.0, 40.0, 171.0],
       [20170326.0, 93.0, 30.0, 28.0, 106.0],
       [20170327.0, 73.0, 46.0, 30.0, 123.0],
       [20170328.0, 0, 28.0, 16.0, 74.0],
       [20170329.0, 46.0, 13.0, 10.0, 50.0],
       [20170330.0, 32.0, 11.0, 19.0, 30.0],
       [20170331.0, 25.0, 70000.0, 11.0, 53.0],
       [20170401.0, 41.0, 10.0, 16.0, 27.0],
       [20170401.0, 41.0, 10.0, 16.0, 27.0],
       [20170402.0, 21.0, 166.0, 66.0, 24.0],
       [20170403.0, 60.0, 67.0, 275.0, 17.0],
       [20170404.0, 74.0, 46.0, 51.0, 18.0],
       [20170405.0, 76.0, 25.0, 42.0, 12.0],
       [20170406.0, 54.0, 32.0, 0, 11.0],
       [20170407.0, 186.0, 18.0, 18.0, 97700.0],
       [20170408.0, 121.0, 0, 23.0, 440.0],
       [20170409.0, 83.0, 17.0, 16.0, 107.0],
       [20170410.0, 87.0, 12.0, 15.0, 117.0],
       [20170411.0, 35.0, 16.0, 22.0, 62.0],
       [20170411.0, 35.0, 16.0, 22.0, 62.0],
       [20170412.0, 0, 23.0, 15.0, 128.0],
       [20170413.0, 25.0, 17.0, 17.0, 0],
       [20170414.0, 24.0, 27.0, 0, 30.0],
       [20170415.0, 13.0, 36.0, 12.0, 65.0],
       [20170415.0, 13.0, 36.0, 12.0, 65.0],
       [20170416.0, 9.0, 45.0, 147.0, 22.0],
       [20170417.0, 14.0, 54.0, 22.0, 22.0],
       [20170418.0, 16.0, 21.0, 4.0, 15.0],
       [20170419.0, 8.0, 13.0, 4.0, 23.0],
       [20170420.0, 0, 59.0, 32452.0, 11.0],
       [20170421.0, 6.0, 89.0, 4.0, 13.0],
       [20170422.0, 5.0, 76.0, 31.0, 11.0]], dtype=object)

Indormation_after_handling=invalid_value_cleaning(Data_Deal_with)

fillup_mean_value(Indormation_after_handling)
Out[29]: 
array([[20170323.0, 126.0, 229.0, 159.0, 134300.0],
       [20170324.0, 128.0, 175.0, 111.0, 221.0],
       [20170325.0, 117.0, 77.0, 40.0, 171.0],
       [20170326.0, 93.0, 30.0, 28.0, 106.0],
       [20170327.0, 73.0, 46.0, 30.0, 123.0],
       [20170328.0, 54.41935483870968, 28.0, 16.0, 74.0],
       [20170329.0, 46.0, 13.0, 10.0, 50.0],
       [20170330.0, 32.0, 11.0, 19.0, 30.0],
       [20170331.0, 25.0, 70000.0, 11.0, 53.0],
       [20170401.0, 41.0, 10.0, 16.0, 27.0],
       [20170401.0, 41.0, 10.0, 16.0, 27.0],
       [20170402.0, 21.0, 166.0, 66.0, 24.0],
       [20170403.0, 60.0, 67.0, 275.0, 17.0],
       [20170404.0, 74.0, 46.0, 51.0, 18.0],
       [20170405.0, 76.0, 25.0, 42.0, 12.0],
       [20170406.0, 54.0, 32.0, 0, 11.0],
       [20170407.0, 186.0, 18.0, 18.0, 97700.0],
       [20170408.0, 121.0, 0, 23.0, 440.0],
       [20170409.0, 83.0, 17.0, 16.0, 107.0],
       [20170410.0, 87.0, 12.0, 15.0, 117.0],
       [20170411.0, 35.0, 16.0, 22.0, 62.0],
       [20170411.0, 35.0, 16.0, 22.0, 62.0],
       [20170412.0, 54.41935483870968, 23.0, 15.0, 128.0],
       [20170413.0, 25.0, 17.0, 17.0, 0],
       [20170414.0, 24.0, 27.0, 0, 30.0],
       [20170415.0, 13.0, 36.0, 12.0, 65.0],
       [20170415.0, 13.0, 36.0, 12.0, 65.0],
       [20170416.0, 9.0, 45.0, 147.0, 22.0],
       [20170417.0, 14.0, 54.0, 22.0, 22.0],
       [20170418.0, 16.0, 21.0, 4.0, 15.0],
       [20170419.0, 8.0, 13.0, 4.0, 23.0],
       [20170420.0, 54.41935483870968, 59.0, 32452.0, 11.0],
       [20170421.0, 6.0, 89.0, 4.0, 13.0],
       [20170422.0, 5.0, 76.0, 31.0, 11.0]], dtype=object)

DataFormat=fillup_mean_value(Indormation_after_handling)

import pandas as pd

Format_DataFrame=pd.DataFrame(DataFormat)

format_array_Frame=lambda x:'%u' % x

Format_DataFrame.applymap(format_array_Frame)
Out[35]: 
           0    1      2      3       4
0   20170323  126    229    159  134300
1   20170324  128    175    111     221
2   20170325  117     77     40     171
3   20170326   93     30     28     106
4   20170327   73     46     30     123
5   20170328   54     28     16      74
6   20170329   46     13     10      50
7   20170330   32     11     19      30
8   20170331   25  70000     11      53
9   20170401   41     10     16      27
10  20170401   41     10     16      27
11  20170402   21    166     66      24
12  20170403   60     67    275      17
13  20170404   74     46     51      18
14  20170405   76     25     42      12
15  20170406   54     32      0      11
16  20170407  186     18     18   97700
17  20170408  121   2167     23     440
18  20170409   83     17     16     107
19  20170410   87     12     15     117
20  20170411   35     16     22      62
21  20170411   35     16     22      62
22  20170412   54     23     15     128
23  20170413   25     17     17       0
24  20170414   24     27      0      30
25  20170415   13     36     12      65
26  20170415   13     36     12      65
27  20170416    9     45    147      22
28  20170417   14     54     22      22
29  20170418   16     21      4      15
30  20170419    8     13      4      23
31  20170420   54     59  32452      11
32  20170421    6     89      4      13
33  20170422    5     76     31      11

你可能感兴趣的:(数据分析)