libsvm/libffm与dataframe格式相互转换

1. libsvm与dataframe格式相互转换

1.1 libsvm转化为dataframe
## 将libsvm转为dataframe
from sklearn.datasets import load_svmlight_file
from pandas import DataFrame
import pandas as pd

# Load the libsvm file: X_train holds the (sparse) features, y_train the labels.
X_train, y_train = load_svmlight_file("libsvm_data.txt")
# toarray() gives a plain ndarray; todense() would return the discouraged np.matrix.
mat = X_train.toarray()

# One column per feature; the names must match the number of features in the file.
df1 = pd.DataFrame(
    mat,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)

df2 = pd.DataFrame(y_train, columns=['target'])

df = pd.concat([df2, df1], axis=1)      # target becomes the first column
df.to_csv("df_data.txt", index=False)

如果libsvm文件中某一行的特征索引不是升序排列的,直接使用load_svmlight_file读取会报错。可以先用下面的函数把每行的特征索引按升序重新排列,再读取生成的新文件:

## 将索引乱序的libsvm文件转化为索引排序的文件
def libsvm_index_order(input_file, out_file):
    """Rewrite a libsvm file so each row's feature indices are ascending.

    Every non-blank input line is read as ``label idx:value idx:value ...``.
    The label and the value strings are copied verbatim; only the order of
    the ``idx:value`` pairs changes. If an index occurs more than once in a
    row, the last occurrence wins. Blank lines are skipped.

    Args:
        input_file: Path of the libsvm file to read.
        out_file: Path of the reordered file to write.

    Raises:
        ValueError: If a feature token is not of the form ``int:value``.
    """
    with open(input_file, 'r') as f_in, open(out_file, 'w') as f_out:
        # Iterate the file object directly so the input is streamed instead
        # of being loaded in full by readlines().
        for line in f_in:
            items = line.split()
            if not items:
                # A blank line has no label; skip it rather than fail on items[0].
                continue
            label = items[0]
            features = {}
            for token in items[1:]:
                key, value = token.split(":")
                # Keys are converted to int so the sort is numeric, not lexical.
                features[int(key)] = value
            ordered = ["{}:{}".format(key, features[key]) for key in sorted(features)]
            f_out.write(" ".join([label] + ordered) + "\n")

# Example run: write a copy of the training file with sorted feature indices.
input_file, out_file = "./ml-tag.train.libfm", "./ml-tag.train.order.libfm"
libsvm_index_order(input_file=input_file, out_file=out_file)
1.2 dataframe转化为libsvm
## 将dataframe转为libsvm
import pandas as pd
from sklearn.datasets import dump_svmlight_file

df = pd.read_csv("data.txt")      # the first field is the target
y = df.target      # y holds the label of every row
# Expand the feature columns (everything after the target) into indicator columns.
dummy = pd.get_dummies(df.iloc[:, 1:])
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
# The float cast yields a single numeric dtype regardless of the dtype
# get_dummies picked for the indicator columns.
mat = dummy.to_numpy(dtype=float)
# zero_based defaults to True (feature numbering starts at 0); False is passed here instead.
dump_svmlight_file(mat, y, 'svm_output.libsvm', zero_based=False)

2. dataframe转换为libffm格式

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

class FFMFormatPandas:
    """Encode a pandas DataFrame as libffm text lines.

    Each row becomes ``label field:feature:value ...``. Object-dtype columns
    get one feature index per distinct value and are written with value
    ``1``; signed-integer columns get one feature index per column and are
    written with their raw value. Columns of any other dtype are left out of
    the output.
    """

    def __init__(self):
        # column name -> field index
        self.field_index_ = None
        # '<column>_<value>' or '<column>' -> feature index
        self.feature_index_ = None
        # name of the label column, or None when there is no label
        self.y = None

    def fit(self, df, y=None):
        """Learn field and feature indices from ``df``.

        Calling ``fit`` again extends the existing mappings: indices that
        were already assigned are kept, and columns or values not seen before
        receive fresh, unused indices.

        Args:
            df: Frame with the feature columns (and the label column when
                ``y`` is given).
            y: Name of the label column, or ``None``.

        Returns:
            ``self``.
        """
        self.y = y
        df_ffm = df[df.columns.difference([self.y])]
        if self.field_index_ is None:
            self.field_index_ = {}
        for col in df_ffm:
            # Field indices are dense from 0, so len() is the next free one.
            self.field_index_.setdefault(col, len(self.field_index_))

        if self.feature_index_ is None:
            self.feature_index_ = {}
        # Continue one past the largest index in use; starting at the maximum
        # itself would give an already-used index to the next new feature.
        last_idx = max(self.feature_index_.values(), default=-1) + 1

        # NOTE: this walks every column of ``df``, the label column included,
        # so label values take up feature indices as well. Left as is so that
        # the numbering of a first fit does not change.
        for col in df.columns:
            for val in df[col].unique():
                if pd.isnull(val):
                    continue
                name = '{}_{}'.format(col, val)
                if name not in self.feature_index_:
                    self.feature_index_[name] = last_idx
                    last_idx += 1
            # Per-column index (used for integer columns). Assigned only once
            # so that a later fit does not move it.
            if col not in self.feature_index_:
                self.feature_index_[col] = last_idx
                last_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit on ``df`` and return its libffm encoding (see ``transform``)."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Encode a single row as one libffm line.

        Args:
            row: Series holding one row, indexed by column name.
            t: Mapping of column name to the column's dtype.

        Returns:
            The line as a string: the label followed by space-separated
            ``field:feature:value`` entries.

        Raises:
            KeyError: If an object-column value was not seen during ``fit``.
        """
        # Label first; without a label column a placeholder 0 is written.
        # Label-based access avoids the positional fallback of ``Series[0]``.
        label = '0' if self.y is None else str(row[self.y])
        ffm = [label]

        for col, val in row.to_dict().items():
            if col == self.y:
                continue
            if pd.isnull(val):
                # fit() assigns no index to missing values, so skip them here too.
                continue
            kind = t[col].kind
            if kind == 'O':
                name = '{}_{}'.format(col, val)
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            elif kind == 'i':
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Encode every row of ``df``.

        Returns:
            A Series of libffm lines carrying the same index as ``df``.
        """
        t = df.dtypes.to_dict()
        # Build from a list: a dict keyed by index label would silently drop
        # rows whose index label is duplicated.
        lines = [self.transform_row_(row, t) for _, row in df.iterrows()]
        return pd.Series(lines, index=df.index)

########################### Lets build some data and test ############################

# Synthetic binary-classification data: 100 samples with 5 features each.
train, y = make_classification(
    n_samples=100,
    n_features=5,
    n_informative=2,
    n_redundant=2,
    n_classes=2,
    random_state=42,
)

train = pd.DataFrame(train, columns=['int1', 'int2', 'int3', 's1', 's2'])
# Turn the first three features into integers.
for int_col in ('int1', 'int2', 'int3'):
    train[int_col] = train[int_col].map(int)
# Turn the last two features into strings of their rounded log magnitude.
for str_col in ('s1', 's2'):
    train[str_col] = round(np.log(abs(train[str_col] + 1))).map(str)
train['clicked'] = y


# Encode the frame as libffm lines, using 'clicked' as the label column.
ffm_train = FFMFormatPandas()
ffm_train_data = ffm_train.fit_transform(train, y='clicked')
print('Base data')
print(train[:10])
print('FFM data')
print(ffm_train_data[:10])

你可能感兴趣的:(编程语言)