1.获取去重后的字符串列表
2.构造全为0的数组(DataFrame), columns为字符串的列表
3.给全为0的数组赋值
第一步
import pandas as pd
import numpy as np

# Step 1: collect the distinct tokens that occur in the comma-separated
# strings of column 'c', keeping them in first-seen order.
df = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one,two,three', 'one,two', 'two,four', 'two,five,four,six',
          'seven,eight,one', 'nine,ten,six,four', 'ten,six,two,seven'],
    'd': list('hjklmno'),
})
print('=' * 40)
print(df['c'])
# Output recorded in the original notes:
# 0        one,two,three
# 1              one,two
# 2             two,four
# 3    two,five,four,six
# 4      seven,eight,one
# 5    nine,ten,six,four
# 6    ten,six,two,seven
# Name: c, dtype: object

# Each row becomes a list of its tokens.
a = df['c'].str.split(',')
print(a)
# Output recorded in the original notes:
# 0          [one, two, three]
# 1                 [one, two]
# 2                [two, four]
# 3    [two, five, four, six]
# 4       [seven, eight, one]
# 5    [nine, ten, six, four]
# 6    [ten, six, two, seven]
# Name: c, dtype: object
print('=' * 50)

# Reuse the split computed above instead of splitting the column twice.
a_lst = a.tolist()
print(a_lst)
# [['one', 'two', 'three'], ['one', 'two'], ['two', 'four'],
#  ['two', 'five', 'four', 'six'], ['seven', 'eight', 'one'],
#  ['nine', 'ten', 'six', 'four'], ['ten', 'six', 'two', 'seven']]
print('*' * 60)

# dict.fromkeys de-duplicates in a single pass while preserving insertion
# order; the previous "if j not in new_lst" scan was quadratic in the number
# of distinct tokens.
new_lst = list(dict.fromkeys(token for tokens in a_lst for token in tokens))
print(new_lst)
# ['one', 'two', 'three', 'four', 'five',
#  'six', 'seven', 'eight', 'nine', 'ten']
第二步
# Step 2: build an all-zero indicator table with one row per row of df and
# one column per distinct token in new_lst.
zeros_shape = (df.shape[0], len(new_lst))
zero_block = np.zeros(zeros_shape)
df_zeros = pd.DataFrame(data=zero_block, columns=new_lst)
print(df_zeros)
# Output recorded in the original notes:
#    one  two  three  four  five  six  seven  eight  nine  ten
# 0  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 1  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 2  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 3  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 4  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 5  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 6  0.0  0.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
第三步的方法二(数据量大的情况下使用):按列整体赋值,与下文"第三步"的逐行赋值结果相同,二选一即可
# Method 2: fill the indicator table one token column at a time instead of
# one row at a time.
#
# Wrapping every row string in commas lets a literal ',token,' search match
# whole tokens only; a plain substring/regex search for the token would also
# hit longer tokens that merely contain it.
padded = ',' + df['c'] + ','
for token in new_lst:
    # regex=False treats the token literally; na=False maps missing strings
    # to "no match" so the mask is always purely boolean.
    hits = padded.str.contains(',' + token + ',', regex=False, na=False)
    # A single .loc assignment writes into df_zeros itself. The chained form
    # df_zeros[token][mask] = 1 may assign into a temporary copy and does not
    # update the frame when pandas Copy-on-Write is active.
    # to_numpy() applies the mask by position: df_zeros has one row per row
    # of df, in the same order.
    df_zeros.loc[hits.to_numpy(), token] = 1
print(df_zeros)
第三步
# Step 3: walk the table row by row and set to 1 every column whose token
# appears in that row's token list.
row_total = len(df_zeros.index)
for row in range(row_total):
    row_tokens = a_lst[row]
    df_zeros.loc[row, row_tokens] = 1
print(df_zeros)
# Output recorded in the original notes:
#    one  two  three  four  five  six  seven  eight  nine  ten
# 0  1.0  1.0    1.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 1  1.0  1.0    0.0   0.0   0.0  0.0    0.0    0.0   0.0  0.0
# 2  0.0  1.0    0.0   1.0   0.0  0.0    0.0    0.0   0.0  0.0
# 3  0.0  1.0    0.0   1.0   1.0  1.0    0.0    0.0   0.0  0.0
# 4  1.0  0.0    0.0   0.0   0.0  0.0    1.0    1.0   0.0  0.0
# 5  0.0  0.0    0.0   1.0   0.0  1.0    0.0    0.0   1.0  1.0
# 6  0.0  1.0    0.0   0.0   0.0  1.0    1.0    0.0   0.0  1.0