import pandas
import numpy as np
# Column names passed explicitly to read_csv via ``names=``.
columns = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "high_income",
]
# A raw string keeps every Windows backslash literal. The previous spelling
# relied on "\m" and "\i" being left alone, which newer Python versions warn
# about as invalid escape sequences. The resulting path is unchanged.
income = pandas.read_csv(r"D:\test\machineLearning\income.csv", names=columns)
print(income.head(2))
age workclass fnlwgt education education_num \
0 39 State-gov 77516 Bachelors 13
1 50 Self-emp-not-inc 83311 Bachelors 13
marital_status occupation relationship race sex \
0 Never-married Adm-clerical Not-in-family White Male
1 Married-civ-spouse Exec-managerial Husband White Male
capital_gain capital_loss hours_per_week native_country high_income
0 2174 0 40 United-States <=50K
1 0 0 13 United-States <=50K
# Show the distinct raw workclass labels before they are encoded.
# Call form of print matches the earlier print(...) usage and also parses on Python 3.
print(income["workclass"].unique())
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
' ?' ' Self-emp-inc']
# Encode the workclass strings as integer category codes.
# Categorical.from_array was deprecated in pandas and dropped from later
# releases; the Categorical constructor is its replacement.
col = pandas.Categorical(income["workclass"])
print(col)
print(col.codes)
[State-gov, Self-emp-not-inc, Private, Private, Private, ..., Private, Private, Private, Self-emp-not-inc, Private]
Length: 629
Categories (7, object): [?, Federal-gov, Local-gov, Private, Self-emp-inc, Self-emp-not-inc, State-gov]
[6 5 3 3 3 3 3 5 3 3 3 6 3 3 3 3 5 3 3 5 3 3 1 3 3 2 3 0 3 3 2 3 3 1 6 3 3
3 3 5 3 5 3 3 3 1 3 3 6 3 3 3 3 1 4 3 3 3 3 3 3 0 3 3 3 3 3 3 4 0 3 3 5 3
3 3 3 0 3 2 3 3 3 3 3 3 2 3 3 1 3 3 3 3 2 2 5 3 3 1 3 3 5 3 3 4 0 3 2 3 3
3 5 3 3 3 4 2 3 3 3 3 3 6 3 3 3 3 0 3 3 3 5 3 3 1 5 3 3 3 4 3 3 3 3 3 3 3
3 0 2 3 3 3 0 3 3 5 3 3 0 3 4 3 5 3 1 6 3 2 3 6 3 3 6 3 3 3 3 3 2 3 3 3 1
3 5 0 3 6 3 3 2 3 1 3 3 1 3 5 3 0 3 2 6 3 3 3 4 3 2 3 3 3 4 3 3 3 3 3 3 0
3 3 2 3 0 5 3 3 3 3 3 3 3 6 6 3 2 3 3 5 6 0 3 3 3 3 3 3 3 3 2 2 3 5 3 3 3
3 3 3 3 3 3 4 0 3 3 3 3 3 5 2 3 3 3 3 3 3 3 2 3 3 3 1 3 3 3 3 3 3 2 3 3 3
4 0 3 3 3 3 3 3 3 3 3 3 3 4 3 3 0 5 3 3 5 3 3 3 5 3 3 3 5 5 0 3 5 3 3 3 3
2 3 3 3 3 3 3 3 3 6 3 6 5 0 0 3 3 3 3 3 2 0 1 3 3 3 5 3 3 3 3 3 3 5 3 3 3
1 3 6 3 3 2 3 3 5 3 3 3 3 3 2 3 3 3 3 3 3 3 5 6 3 3 3 0 3 4 3 3 6 3 3 3 3
5 0 3 2 3 3 3 6 3 3 3 2 6 3 3 6 3 3 3 1 3 4 2 0 0 5 3 3 3 2 2 3 2 3 3 1 3
3 3 3 3 3 0 3 4 3 3 5 3 3 3 3 0 3 2 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3
5 6 3 0 3 0 2 2 3 3 1 3 3 3 3 3 3 3 0 2 2 3 3 3 3 3 3 2 3 3 0 3 3 3 0 3 0
0 3 4 3 2 2 3 3 3 3 5 5 3 3 3 3 3 2 3 3 4 0 3 3 3 3 3 3 3 2 3 3 5 5 3 3 1
3 3 3 3 3 3 3 3 3 5 3 4 3 5 3 3 3 3 3 3 3 0 5 2 5 0 3 3 3 5 3 3 3 3 4 0 0
3 3 3 3 0 5 3 3 3 3 3 2 5 3 3 3 2 3 3 3 3 2 3 3 3 5 4 3 3 3 3 3 3 3 3 5 3]
# ``col`` still holds the workclass Categorical at this point, so look up the
# integer code of the "Private" label now, before the loop below rebinds
# ``col``. A hard-coded code is fragile: the code assigned to a label depends
# on which labels occur in the loaded file. Labels are stripped because the
# raw values carry a leading space. Raises ValueError if the label is absent.
private_code = [str(label).strip() for label in col.categories].index("Private")
income["workclass"] = col.codes

# Replace every remaining text column with its integer category codes.
for name in ["education", "marital_status", "occupation", "relationship",
             "race", "sex", "native_country", "high_income"]:
    col = pandas.Categorical(income[name])
    income[name] = col.codes

# Split the rows into private-sector and all other work classes.
private_income = income[income["workclass"] == private_code]
public_income = income[income["workclass"] != private_code]
print(private_income.head(2))
print(public_income.head(2))
age workclass fnlwgt education education_num marital_status \
54 47 4 109832 11 9 0
68 49 4 191681 15 10 2
occupation relationship race sex capital_gain capital_loss \
54 4 1 4 1 0 0
68 4 0 4 1 0 0
hours_per_week native_country high_income
54 60 26 0
68 50 26 1
age workclass fnlwgt education education_num marital_status \
0 39 6 77516 9 13 4
1 50 5 83311 9 13 2
occupation relationship race sex capital_gain capital_loss \
0 1 1 4 1 2174 0
1 4 0 4 1 0 0
hours_per_week native_country high_income
0 40 26 0
1 13 26 0
import math
import numpy as np
def calc_entropy(column):
    """Return the base-2 Shannon entropy of a column of integer labels.

    Args:
        column: Sequence of non-negative integers (anything accepted by
            ``np.bincount``), one label per row.

    Returns:
        The entropy in bits: ``-sum(p * log2(p))`` over the labels that
        actually occur, where ``p`` is each label's share of the rows.
    """
    label_counts = np.bincount(column)
    row_total = float(len(column))
    # Labels with a zero count are skipped: log(0) is undefined and such
    # labels contribute nothing to the entropy.
    weighted_logs = sum(
        share * math.log(share, 2)
        for share in label_counts / row_total
        if share > 0
    )
    return -weighted_logs
# Quick check on a tiny column holding three 1s and two 0s.
calc_entropy([1,1,0,0,1])
0.97095059445466858
# Information gain obtained by splitting the rows at the median age.
high_entropy = calc_entropy(income["high_income"])
median_age = income["age"].median()
left_age = income[income["age"] <= median_age]
right_age = income[income["age"] > median_age]

# float() forces true division. With Python 2 integer division both branch
# weights truncate to 0, so the "gain" collapses to the parent entropy.
total_rows = float(len(income))
info_gain = high_entropy - (
    left_age.shape[0] / total_rows * calc_entropy(left_age["high_income"])
    + right_age.shape[0] / total_rows * calc_entropy(right_age["high_income"])
)
print(info_gain)
0.756141116271
# np.bincount returns one count per integer from 0 up to the largest value
# present, including zeros for integers that never occur.
print(np.bincount([1, 1, 0, 0, 1, 3, 3, 5]))
[2 3 0 2 0 1]
def calc_information_gain(data, split_name, target_name):
    """Return the information gain of splitting ``data`` at a column's median.

    Rows whose ``split_name`` value is at or below the median form one
    branch and the remaining rows form the other. The gain is the entropy of
    ``target_name`` over all rows minus the size-weighted entropies of the
    two branches.

    Args:
        data: Table (pandas DataFrame) holding both columns.
        split_name: Name of the numeric column to split on.
        target_name: Name of the integer-coded target column.

    Returns:
        The information gain as a float; ``0.0`` when ``data`` has no rows.
    """
    # Use the table that was passed in, not the module-level ``income``
    # frame, so the function works on any subset or other dataset.
    total_rows = float(len(data))
    if total_rows == 0:
        # Nothing to split; also avoids dividing by zero below.
        return 0.0

    parent_entropy = calc_entropy(data[target_name])
    median = data[split_name].median()
    branches = (
        data[data[split_name] <= median],
        data[data[split_name] > median],
    )
    weighted_entropy = sum(
        branch.shape[0] / total_rows * calc_entropy(branch[target_name])
        for branch in branches
    )
    return parent_entropy - weighted_entropy
gain = calc_information_gain(income, "age", "high_income")
print(gain)

# Candidate split columns. This rebinds ``columns``, which earlier held the
# CSV column names.
columns = ["age", "workclass", "education_num", "marital_status", "occupation",
           "relationship", "race", "sex", "hours_per_week", "native_country"]

# Gain of a median split on each candidate, in the same order as ``columns``.
information_gains = [
    calc_information_gain(income, column_name, "high_income")
    for column_name in columns
]
index = information_gains.index(max(information_gains))
print(information_gains[index])
0.0501271848501
0.125533153029
def find_best_column(data, target_name, columns):
    """Return the highest information gain among the candidate columns.

    Despite its name, this returns the gain value of the best median split,
    not the name of the column that achieves it.

    Args:
        data: Table (pandas DataFrame) containing the columns.
        target_name: Name of the integer-coded target column.
        columns: Iterable of candidate column names to evaluate.

    Returns:
        The largest gain reported by ``calc_information_gain``.

    Raises:
        ValueError: If ``columns`` is empty.
    """
    # Collect gains in a local list. Appending to the module-level
    # ``information_gains`` made the list grow on every call and let stale
    # gains from earlier runs or other tables win the comparison.
    gains = [
        calc_information_gain(data, column_name, target_name)
        for column_name in columns
    ]
    return max(gains)
# Evaluate the candidate columns against the high_income target on the full table.
find_best_column(income,"high_income",columns)
0.12553315302923063