import numpy as np
def get_repeated_label(array_1dim):
label_1dim=[]
for k in range(len(array_1dim)-1):
for i in range(k+1,len(array_1dim),1):
if array_1dim[k]==array_1dim[i]:
label_1dim.append(i)
return np.unique(label_1dim)
def invalid_value_cleaning (array_variable):
for i in range(np.shape(array_variable)[0]):
for j in range(np.shape(array_variable)[1]):
if type(array_variable[i,j])==float or type(array_variable[i,j])==int:
pass
else:
array_variable[i,j]=0
array_variable[np.where(np.isnan(array_variable.astype(np.float64)))]=0
return array_variable
def fillup_mean_value(array_parameter):
for index_j in range(np.shape(array_parameter)[1]):
array_columns_nr=list(np.where(array_parameter==0)[1])
if index_j in array_columns_nr:
appearance_sum=array_columns_nr.count(index_j)
if appearance_sum>0:
mean_1=(array_parameter[:,index_j]).sum()/(np.shape(array_parameter)[0]-appearance_sum)
bollmatrix_1=array_parameter[:,index_j]==0
array_parameter[bollmatrix_1,index_j]=mean_1
return array_parameter
def delete_rows(list_variable,array_variable):
i=0
list_variable.sort()
array_medium=array_variable
for j in list_variable:
if i==0:
array_medium=np.delete(array_medium,j,0)
else:
array_medium=np.delete(array_medium,j-i,0)
i=i+1
return array_medium
import xlrd
name=r'Desktop\salesdetails.xlsx'
information_1=xlrd.open_workbook(name)
information_2 = information_1.sheet_by_name(u'Sheet1')
dataload_1=[]
nrows = information_2.nrows
for j in range(1,nrows,1):
dataload_1.append(list(information_2.row_values(j)))
Data_Deal_with=np.array(dataload_1,dtype=np.object)
Data_Deal_with
Out[21]:
array([[20170323.0, 126.0, 229.0, 159.0, 134300.0],
[20170324.0, 128.0, 175.0, 111.0, 221.0],
[20170325.0, 117.0, 77.0, 40.0, 171.0],
[20170326.0, 93.0, 30.0, 28.0, 106.0],
[20170327.0, 73.0, 46.0, 30.0, 123.0],
[20170328.0, '¥$#', 28.0, 16.0, 74.0],
[20170329.0, 46.0, 13.0, 10.0, 50.0],
[20170330.0, 32.0, 11.0, 19.0, 30.0],
[20170331.0, 25.0, 70000.0, 11.0, 53.0],
[20170401.0, 41.0, 10.0, 16.0, 27.0],
[20170401.0, 41.0, 10.0, 16.0, 27.0],
[20170402.0, 21.0, 166.0, 66.0, 24.0],
[20170403.0, 60.0, 67.0, 275.0, 17.0],
[20170404.0, 74.0, 46.0, 51.0, 18.0],
[20170405.0, 76.0, 25.0, 42.0, 12.0],
[20170406.0, 54.0, 32.0, '', 11.0],
[20170407.0, 186.0, 18.0, 18.0, 97700.0],
[20170408.0, 121.0, 'sales ist Null', 23.0, 440.0],
[20170409.0, 83.0, 17.0, 16.0, 107.0],
[20170410.0, 87.0, 12.0, 15.0, 117.0],
[20170411.0, 35.0, 16.0, 22.0, 62.0],
[20170411.0, 35.0, 16.0, 22.0, 62.0],
[20170412.0, '', 23.0, 15.0, 128.0],
[20170413.0, 25.0, 17.0, 17.0, ''],
[20170414.0, 24.0, 27.0, '', 30.0],
[20170415.0, 13.0, 36.0, 12.0, 65.0],
[20170415.0, 13.0, 36.0, 12.0, 65.0],
[20170416.0, 9.0, 45.0, 147.0, 22.0],
[20170417.0, 14.0, 54.0, 22.0, 22.0],
[20170418.0, 16.0, 21.0, 4.0, 15.0],
[20170419.0, 8.0, 13.0, 4.0, 23.0],
[20170420.0, '%$', 59.0, 32452.0, 11.0],
[20170421.0, 6.0, 89.0, 4.0, 13.0],
[20170422.0, 5.0, 76.0, 31.0, 11.0]], dtype=object)
Columns_1=Data_Deal_with[:,0]
Columns_1
Out[23]:
array([20170323.0, 20170324.0, 20170325.0, 20170326.0, 20170327.0,
20170328.0, 20170329.0, 20170330.0, 20170331.0, 20170401.0,
20170401.0, 20170402.0, 20170403.0, 20170404.0, 20170405.0,
20170406.0, 20170407.0, 20170408.0, 20170409.0, 20170410.0,
20170411.0, 20170411.0, 20170412.0, 20170413.0, 20170414.0,
20170415.0, 20170415.0, 20170416.0, 20170417.0, 20170418.0,
20170419.0, 20170420.0, 20170421.0, 20170422.0], dtype=object)
get_repeated_label(Columns_1)
Out[24]: array([10, 21, 26])
delete_rows(get_repeated_label(Columns_1),Data_Deal_with)
Out[25]:
array([[20170323.0, 126.0, 229.0, 159.0, 134300.0],
[20170324.0, 128.0, 175.0, 111.0, 221.0],
[20170325.0, 117.0, 77.0, 40.0, 171.0],
[20170326.0, 93.0, 30.0, 28.0, 106.0],
[20170327.0, 73.0, 46.0, 30.0, 123.0],
[20170328.0, '¥$#', 28.0, 16.0, 74.0],
[20170329.0, 46.0, 13.0, 10.0, 50.0],
[20170330.0, 32.0, 11.0, 19.0, 30.0],
[20170331.0, 25.0, 70000.0, 11.0, 53.0],
[20170401.0, 41.0, 10.0, 16.0, 27.0],
[20170402.0, 21.0, 166.0, 66.0, 24.0],
[20170403.0, 60.0, 67.0, 275.0, 17.0],
[20170404.0, 74.0, 46.0, 51.0, 18.0],
[20170405.0, 76.0, 25.0, 42.0, 12.0],
[20170406.0, 54.0, 32.0, '', 11.0],
[20170407.0, 186.0, 18.0, 18.0, 97700.0],
[20170408.0, 121.0, 'sales ist Null', 23.0, 440.0],
[20170409.0, 83.0, 17.0, 16.0, 107.0],
[20170410.0, 87.0, 12.0, 15.0, 117.0],
[20170411.0, 35.0, 16.0, 22.0, 62.0],
[20170411.0, 35.0, 16.0, 22.0, 62.0],
[20170412.0, '', 23.0, 15.0, 128.0],
[20170413.0, 25.0, 17.0, 17.0, ''],
[20170414.0, 24.0, 27.0, '', 30.0],
[20170415.0, 13.0, 36.0, 12.0, 65.0],
[20170415.0, 13.0, 36.0, 12.0, 65.0],
[20170416.0, 9.0, 45.0, 147.0, 22.0],
[20170417.0, 14.0, 54.0, 22.0, 22.0],
[20170418.0, 16.0, 21.0, 4.0, 15.0],
[20170419.0, 8.0, 13.0, 4.0, 23.0],
[20170420.0, '%$', 59.0, 32452.0, 11.0],
[20170421.0, 6.0, 89.0, 4.0, 13.0],
[20170422.0, 5.0, 76.0, 31.0, 11.0]], dtype=object)
invalid_value_cleaning(Data_Deal_with)
Out[27]:
array([[20170323.0, 126.0, 229.0, 159.0, 134300.0],
[20170324.0, 128.0, 175.0, 111.0, 221.0],
[20170325.0, 117.0, 77.0, 40.0, 171.0],
[20170326.0, 93.0, 30.0, 28.0, 106.0],
[20170327.0, 73.0, 46.0, 30.0, 123.0],
[20170328.0, 0, 28.0, 16.0, 74.0],
[20170329.0, 46.0, 13.0, 10.0, 50.0],
[20170330.0, 32.0, 11.0, 19.0, 30.0],
[20170331.0, 25.0, 70000.0, 11.0, 53.0],
[20170401.0, 41.0, 10.0, 16.0, 27.0],
[20170401.0, 41.0, 10.0, 16.0, 27.0],
[20170402.0, 21.0, 166.0, 66.0, 24.0],
[20170403.0, 60.0, 67.0, 275.0, 17.0],
[20170404.0, 74.0, 46.0, 51.0, 18.0],
[20170405.0, 76.0, 25.0, 42.0, 12.0],
[20170406.0, 54.0, 32.0, 0, 11.0],
[20170407.0, 186.0, 18.0, 18.0, 97700.0],
[20170408.0, 121.0, 0, 23.0, 440.0],
[20170409.0, 83.0, 17.0, 16.0, 107.0],
[20170410.0, 87.0, 12.0, 15.0, 117.0],
[20170411.0, 35.0, 16.0, 22.0, 62.0],
[20170411.0, 35.0, 16.0, 22.0, 62.0],
[20170412.0, 0, 23.0, 15.0, 128.0],
[20170413.0, 25.0, 17.0, 17.0, 0],
[20170414.0, 24.0, 27.0, 0, 30.0],
[20170415.0, 13.0, 36.0, 12.0, 65.0],
[20170415.0, 13.0, 36.0, 12.0, 65.0],
[20170416.0, 9.0, 45.0, 147.0, 22.0],
[20170417.0, 14.0, 54.0, 22.0, 22.0],
[20170418.0, 16.0, 21.0, 4.0, 15.0],
[20170419.0, 8.0, 13.0, 4.0, 23.0],
[20170420.0, 0, 59.0, 32452.0, 11.0],
[20170421.0, 6.0, 89.0, 4.0, 13.0],
[20170422.0, 5.0, 76.0, 31.0, 11.0]], dtype=object)
Indormation_after_handling=invalid_value_cleaning(Data_Deal_with)
fillup_mean_value(Indormation_after_handling)
Out[29]:
array([[20170323.0, 126.0, 229.0, 159.0, 134300.0],
[20170324.0, 128.0, 175.0, 111.0, 221.0],
[20170325.0, 117.0, 77.0, 40.0, 171.0],
[20170326.0, 93.0, 30.0, 28.0, 106.0],
[20170327.0, 73.0, 46.0, 30.0, 123.0],
[20170328.0, 54.41935483870968, 28.0, 16.0, 74.0],
[20170329.0, 46.0, 13.0, 10.0, 50.0],
[20170330.0, 32.0, 11.0, 19.0, 30.0],
[20170331.0, 25.0, 70000.0, 11.0, 53.0],
[20170401.0, 41.0, 10.0, 16.0, 27.0],
[20170401.0, 41.0, 10.0, 16.0, 27.0],
[20170402.0, 21.0, 166.0, 66.0, 24.0],
[20170403.0, 60.0, 67.0, 275.0, 17.0],
[20170404.0, 74.0, 46.0, 51.0, 18.0],
[20170405.0, 76.0, 25.0, 42.0, 12.0],
[20170406.0, 54.0, 32.0, 0, 11.0],
[20170407.0, 186.0, 18.0, 18.0, 97700.0],
[20170408.0, 121.0, 0, 23.0, 440.0],
[20170409.0, 83.0, 17.0, 16.0, 107.0],
[20170410.0, 87.0, 12.0, 15.0, 117.0],
[20170411.0, 35.0, 16.0, 22.0, 62.0],
[20170411.0, 35.0, 16.0, 22.0, 62.0],
[20170412.0, 54.41935483870968, 23.0, 15.0, 128.0],
[20170413.0, 25.0, 17.0, 17.0, 0],
[20170414.0, 24.0, 27.0, 0, 30.0],
[20170415.0, 13.0, 36.0, 12.0, 65.0],
[20170415.0, 13.0, 36.0, 12.0, 65.0],
[20170416.0, 9.0, 45.0, 147.0, 22.0],
[20170417.0, 14.0, 54.0, 22.0, 22.0],
[20170418.0, 16.0, 21.0, 4.0, 15.0],
[20170419.0, 8.0, 13.0, 4.0, 23.0],
[20170420.0, 54.41935483870968, 59.0, 32452.0, 11.0],
[20170421.0, 6.0, 89.0, 4.0, 13.0],
[20170422.0, 5.0, 76.0, 31.0, 11.0]], dtype=object)
DataFormat=fillup_mean_value(Indormation_after_handling)
import pandas as pd
Format_DataFrame=pd.DataFrame(DataFormat)
format_array_Frame=lambda x:'%u' % x
Format_DataFrame.applymap(format_array_Frame)
Out[35]:
0 1 2 3 4
0 20170323 126 229 159 134300
1 20170324 128 175 111 221
2 20170325 117 77 40 171
3 20170326 93 30 28 106
4 20170327 73 46 30 123
5 20170328 54 28 16 74
6 20170329 46 13 10 50
7 20170330 32 11 19 30
8 20170331 25 70000 11 53
9 20170401 41 10 16 27
10 20170401 41 10 16 27
11 20170402 21 166 66 24
12 20170403 60 67 275 17
13 20170404 74 46 51 18
14 20170405 76 25 42 12
15 20170406 54 32 0 11
16 20170407 186 18 18 97700
17 20170408 121 2167 23 440
18 20170409 83 17 16 107
19 20170410 87 12 15 117
20 20170411 35 16 22 62
21 20170411 35 16 22 62
22 20170412 54 23 15 128
23 20170413 25 17 17 0
24 20170414 24 27 0 30
25 20170415 13 36 12 65
26 20170415 13 36 12 65
27 20170416 9 45 147 22
28 20170417 14 54 22 22
29 20170418 16 21 4 15
30 20170419 8 13 4 23
31 20170420 54 59 32452 11
32 20170421 6 89 4 13
33 20170422 5 76 31 11