把字符串离散化

1. 获取字符串去重后的列表
2. 构造全为 0 的数组(DataFrame),columns 为去重后的字符串列表
3. 给全为 0 的数组赋值

第一步:获取字符串去重后的列表

import pandas as pd
import numpy as np

# Sample frame: column 'c' holds comma-separated category tokens that the
# following steps turn into one indicator column per distinct token.
df = pd.DataFrame({'a': range(7),
                   'b': range(7, 0, -1),
                   'c': ['one,two,three',
                         'one,two',
                         'two,four',
                         'two,five,four,six',
                         'seven,eight,one',
                         'nine,ten,six,four',
                         'ten,six,two,seven'],
                   'd': list('hjklmno')})
print('=' * 40)
print(df['c'])
# Expected output:
# 0        one,two,three
# 1              one,two
# 2             two,four
# 3    two,five,four,six
# 4      seven,eight,one
# 5    nine,ten,six,four
# 6    ten,six,two,seven
# Name: c, dtype: object

# Split every string on ',' so each row becomes a list of tokens.
a = df['c'].str.split(',')
print(a)
# Expected output:
# 0         [one, two, three]
# 1                [one, two]
# 2               [two, four]
# 3    [two, five, four, six]
# 4       [seven, eight, one]
# 5    [nine, ten, six, four]
# 6    [ten, six, two, seven]
# Name: c, dtype: object
print('=' * 50)
# Reuse the already-split Series rather than splitting the column again.
a_lst = a.tolist()
print(a_lst)

# [['one', 'two', 'three'], ['one', 'two'], ['two', 'four'],
# ['two', 'five', 'four', 'six'], ['seven', 'eight', 'one'],
# ['nine', 'ten', 'six', 'four'], ['ten', 'six', 'two', 'seven']]

print('*' * 60)
# Distinct tokens in first-seen order. dict keys are unique and keep insertion
# order, so this removes duplicates in a single pass instead of scanning a
# growing list for every token.
new_lst = list(dict.fromkeys(token for tokens in a_lst for token in tokens))
print(new_lst)
# ['one', 'two', 'three', 'four', 'five',
# 'six', 'seven', 'eight', 'nine', 'ten']

第二步:构造全为 0 的 DataFrame,columns 为去重后的字符串列表

# Step 2: an all-zero float frame with one row per row of df and one column
# per distinct token in new_lst; later steps switch selected cells to 1.
df_zeros = pd.DataFrame(np.zeros((len(df), len(new_lst))), columns=new_lst)
print(df_zeros)
# Expected output:
#    one  two  three  four  five  six  seven  eight  nine  ten
# 0  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 1  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 2  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 3  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 4  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 5  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 6  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0

方法二(数据量大的情况下使用):按列赋值,对每个去重后的字符串一次性标记所有包含它的行;可替代下文第三步的逐行赋值,两种方法任选其一即可

# Method 2 (column-wise): for each distinct token, set that token's column to 1
# on every row whose comma-separated string in df['c'] contains the token as a
# whole item.
# Wrapping both the cell text and the token in commas makes this an exact-token
# test, so a token cannot match inside a longer one; regex=False keeps tokens
# that contain regex metacharacters literal.
padded_c = ',' + df['c'] + ','
for i in new_lst:
    # na=False keeps the mask strictly boolean when a cell is missing.
    hit_rows = padded_c.str.contains(',' + i + ',', regex=False, na=False)
    # One .loc write instead of the chained df_zeros[i][mask] = 1, which
    # assigns into an intermediate object and is not guaranteed to update
    # df_zeros (it does not under pandas copy-on-write).
    df_zeros.loc[hit_rows, i] = 1
print(df_zeros)

 

第三步(方法一):逐行给全为 0 的数组赋值

# Step 3 (row-wise): walk the rows of df_zeros together with the matching token
# list from a_lst and set every column named in that list to 1 for that row.
# Row positions double as .loc labels here, as in the original index loop.
for row, tokens in zip(range(len(df_zeros)), a_lst):
    df_zeros.loc[row, tokens] = 1

print(df_zeros)
# Expected output:
#    one  two  three  four  five  six  seven  eight  nine  ten
# 0  1.0  1.0    1.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 1  1.0  1.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 2  0.0  1.0    0.0   1.0   0.0  0.0    0.0    0.0   0.0  0.0
# 3  0.0  1.0    0.0   1.0   1.0  1.0    0.0    0.0   0.0  0.0
# 4  1.0  0.0    0.0   0.0   0.0  0.0    1.0    1.0   0.0  0.0
# 5  0.0  0.0    0.0   1.0   0.0  1.0    0.0    0.0   1.0  1.0
# 6  0.0  1.0    0.0   0.0   0.0  1.0    1.0    0.0   0.0  1.0

 

 

你可能感兴趣的:(把字符串离散化)