## Convert a libsvm file to a DataFrame
from sklearn.datasets import load_svmlight_file
from pandas import DataFrame
import pandas as pd
# Load the sparse feature matrix and the label vector from the libsvm file.
X_train, y_train = load_svmlight_file("libsvm_data.txt")
# toarray() yields a plain ndarray; todense() would return the legacy np.matrix type.
mat = X_train.toarray()
# NOTE(review): the four column names assume the file holds exactly four features.
df1 = pd.DataFrame(mat, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])
df2 = pd.DataFrame(y_train, columns=['target'])
df = pd.concat([df2, df1], axis=1)  # target is the first column
df.to_csv("df_data.txt", index=False)
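# If the feature indices in a libsvm file are out of order, reading it directly
# with load_svmlight_file raises an error. Use the function below to rewrite
# every row so that its indices are in ascending order: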
def libsvm_index_order(input_file, out_file):
    """Rewrite a libsvm file so each row's features are in ascending index order.

    Every non-blank input line is split on whitespace. The first token (the
    label) is copied unchanged; the remaining ``index:value`` tokens are sorted
    by their integer index. If an index appears more than once on a line, the
    last value wins. Blank lines are skipped.

    Args:
        input_file: Path of the libsvm file to read.
        out_file: Path of the file the reordered rows are written to.

    Raises:
        ValueError: If a feature token is not of the form ``int:value``.
    """
    with open(input_file, 'r') as f_in, open(out_file, 'w') as f_out:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f_in:
            items = line.split()
            if not items:
                # A blank line has no label token, so there is nothing to write.
                continue
            features = {}
            for token in items[1:]:
                key, value = token.split(":")
                features[int(key)] = value
            ordered = ["{}:{}".format(key, features[key]) for key in sorted(features)]
            f_out.write(" ".join([items[0]] + ordered) + "\n")
# Example run: reorder the feature indices of the training file.
input_file, out_file = "./ml-tag.train.libfm", "./ml-tag.train.order.libfm"
libsvm_index_order(input_file, out_file)
## Convert a DataFrame to a libsvm file
import pandas as pd
from sklearn.datasets import dump_svmlight_file
df = pd.read_csv("data.txt")  # the first field is the target
y = df.target  # y holds the label values
# One-hot encode every column after the first (target) column.
dummy = pd.get_dummies(df.iloc[:, 1:])
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
# Casting to float gives one homogeneous numeric matrix whatever dtype the
# dummy columns come back as.
mat = dummy.to_numpy(dtype=float)
# zero_based defaults to True (column numbering starts at 0); False starts it at 1.
dump_svmlight_file(mat, y, 'svm_output.libsvm', zero_based=False)
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
class FFMFormatPandas:
    """Encode DataFrame rows as ``label field:feature:value ...`` strings.

    ``fit`` assigns an integer to every non-label column (the field index) and
    an integer to every ``<column>_<value>`` pair and to every column name (the
    feature index). ``transform`` then renders each row as one string.

    Only object-typed columns (emitted as ``field:feature:1``) and signed
    integer columns (emitted as ``field:feature:value``) appear in the output;
    columns of any other dtype are skipped.
    """

    def __init__(self):
        # Column name -> field index; built on the first call to fit().
        self.field_index_ = None
        # '<column>_<value>' or column name -> feature index.
        self.feature_index_ = None
        # Name of the label column, or None when there is no label.
        self.y = None

    def fit(self, df, y=None):
        """Build (or extend) the field and feature indices from ``df``.

        Args:
            df: DataFrame whose columns and distinct values are indexed.
            y: Name of the label column, or None.

        Returns:
            self, so calls can be chained.
        """
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = {col: i for i, col in enumerate(df_ffm)}

        if self.feature_index_ is None:
            self.feature_index_ = {}
        # Resume after the largest index already handed out. Starting at the
        # maximum itself would give the next new feature a duplicate index.
        last_idx = max(self.feature_index_.values(), default=-1) + 1

        for col in df.columns:
            for val in df[col].unique():
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            # Keep an existing column-level index so a refit never renumbers
            # features that were already assigned.
            if col not in self.feature_index_:
                self.feature_index_[col] = last_idx
                last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit on ``df`` and return its encoded rows (see ``transform``)."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Encode a single row.

        Args:
            row: Series holding one DataFrame row, indexed by column name.
            t: Mapping of column name -> dtype of that column.

        Returns:
            The label (``0`` when no label column is set) followed by one
            ``field:feature:value`` token per emitted column, space-separated.
        """
        ffm = []
        if self.y is not None:
            # .iloc[0] is an explicit positional lookup; plain [0] on a
            # label-indexed Series relies on a deprecated fallback.
            ffm.append(str(row.loc[row.index == self.y].iloc[0]))
        else:
            ffm.append(str(0))

        for col, val in row.loc[row.index != self.y].to_dict().items():
            col_type = t[col]
            if col_type.kind == 'O':
                if pd.isnull(val):
                    # fit() never indexes null values, so emit nothing for them.
                    continue
                name = '{}_{}'.format(col, val)
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif col_type.kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Return a Series mapping each index label of ``df`` to its encoded row."""
        t = df.dtypes.to_dict()
        return pd.Series({idx: self.transform_row_(row, t) for idx, row in df.iterrows()})
# ---------------------- Build some sample data and try it out ----------------------
train, y = make_classification(
    n_samples=100,
    n_features=5,
    n_informative=2,
    n_redundant=2,
    n_classes=2,
    random_state=42,
)
train = pd.DataFrame(train, columns=['int1', 'int2', 'int3', 's1', 's2'])

# The first three columns are converted to integers.
for _col in ('int1', 'int2', 'int3'):
    train[_col] = train[_col].map(int)

# The last two columns are turned into strings via a rounded log transform.
for _col in ('s1', 's2'):
    train[_col] = round(np.log(abs(train[_col] + 1))).map(str)

train['clicked'] = y

ffm_train = FFMFormatPandas()
ffm_train_data = ffm_train.fit_transform(train, y='clicked')

print('Base data')
print(train.head(10))
print('FFM data')
print(ffm_train_data.head(10))