使用 TF-IDF + 逻辑回归识别垃圾文本

引入相关包

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, roc_auc_score
import joblib
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle

# Directory that holds the input CSV files and receives the saved model files.
# NOTE(review): this is a machine-specific absolute path; os.chdir raises if
# the directory does not exist on the machine running the script.
path  = '/Users/xinghuatianying/data/DataSets/im_cheat/'
# Switch the working directory so the relative file names used by the
# functions below ('msg_train.csv', 'tfidf_model.pkl', ...) resolve against it.
os.chdir(path)

加载数据

训练样本demo:

label	msg
1       全网最低价完善数据 手把手教你引流 详细+V:vd12388
1       耍.微.店等.级评.价销.量回头.率➕15314268311
1       我收姐妹+我QQ2877613260
1       违规啥啊 佳651815289➕我q
0       温和洁面膏+红粉爽肤水
0       小卡盲盒改地址
0       20杯(每个口味各4杯)
def load_data():
    """Load the labelled messages, split them into characters, and make a train/test split.

    Reads ``msg_train.csv`` from the current working directory as a
    tab-separated file with the columns ``label`` and ``msg``. Because
    ``names`` is passed explicitly, the first line of the file is read as
    data, not as a header.
    NOTE(review): if the real file starts with a ``label<TAB>msg`` header
    line, that line ends up as a data row -- confirm the file format.

    Rows with a missing label or message are dropped: a missing message is
    read as a float NaN, which cannot be joined character by character.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)`` where the ``x_*`` parts
        hold the space-separated message texts and the ``y_*`` parts the
        labels; 30% of the rows go to the test part.
    """
    data = pd.read_csv(
        'msg_train.csv',
        sep="\t",
        names=['label', 'msg'],
        # Keep messages as text even when they consist only of digits.
        dtype={'msg': str},
    )
    # Incomplete rows would crash the character split below (missing msg)
    # or the classifier fit (missing label), so remove them up front.
    data = data.dropna(subset=['label', 'msg'])

    # Shuffle the rows; the fixed seed keeps the order reproducible.
    data = data.sample(frac=1, random_state=42)

    # Put a space between every character so that a word-level tokenizer
    # treats each character as one token.
    data['msg'] = data['msg'].apply(' '.join)

    x_train, x_test, y_train, y_test = train_test_split(
        data['msg'],
        data['label'],
        test_size=0.3,
        random_state=42,
    )
    return x_train, x_test, y_train, y_test

模型训练

def train_model(x_train, x_test, y_train, y_test):
    """Fit the TF-IDF vectorizer and the logistic-regression classifier, save both, and print accuracy.

    Args:
        x_train: Training texts, already split into space-separated tokens.
        x_test: Held-out texts in the same format.
        y_train: Labels for ``x_train``.
        y_test: Labels for ``x_test``.

    Side effects:
        Writes ``tfidf_model.pkl`` (pickle) and ``lr_word_ngram.pkl``
        (joblib) to the current working directory, and prints the vocabulary
        size followed by the accuracy on the held-out texts.
    """
    vectorizer_word = TfidfVectorizer(
        max_features=800000,
        # Accept single-character tokens: the texts arrive with one
        # character per token, so a pattern requiring two or more word
        # characters would discard almost everything.
        token_pattern=r"(?u)\b\w+\b",
        min_df=1,
        analyzer='word',
        # Use sequences of 1 to 5 consecutive tokens as features.
        ngram_range=(1, 5),
    )

    # Learn the vocabulary/IDF weights and vectorize the training texts in
    # one pass (equivalent to fit followed by transform).
    tfidf_train = vectorizer_word.fit_transform(x_train)

    # Save the fitted vectorizer so prediction can reuse the same vocabulary.
    with open('tfidf_model.pkl', 'wb') as f:
        pickle.dump(vectorizer_word, f)

    tfidf_test = vectorizer_word.transform(x_test)
    # Report the vocabulary size.
    print(len(vectorizer_word.vocabulary_))

    # Train the logistic-regression classifier.
    lr_word = LogisticRegression(solver='sag', verbose=2)
    lr_word.fit(tfidf_train, y_train)
    # Save the classifier so it can be reused without retraining.
    joblib.dump(lr_word, 'lr_word_ngram.pkl')

    # Evaluate with the model read back from disk, which also checks that
    # the saved file is usable.
    model = joblib.load(filename="lr_word_ngram.pkl")
    y_pred_label = model.predict(tfidf_test)
    print(accuracy_score(y_test, y_pred_label))

模型预测

def predcit_main():
    """Score the messages in ``test_msg.csv`` with the saved vectorizer and classifier.

    Reads ``test_msg.csv`` from the current working directory into a single
    ``msg`` column, splits every message into space-separated characters,
    vectorizes them with the vectorizer stored in ``tfidf_model.pkl``, and
    computes the class-1 probability with the classifier stored in
    ``lr_word_ngram.pkl``.

    Rows without a message are dropped, because a missing value is read as
    a float NaN and cannot be joined character by character.

    Side effects:
        Prints the data shape, the first rows, and the first ten processed
        texts; writes ``predict_test.csv`` (tab-separated, columns
        ``y_pred_word`` and ``x_test``) to the current working directory.
    """
    # NOTE(review): the file is parsed with the default comma separator, so a
    # message containing a comma would be split across columns -- confirm the
    # format of test_msg.csv.
    data = pd.read_csv(
        'test_msg.csv',
        names=['msg'],
        # Keep messages as text even when they consist only of digits.
        dtype={'msg': str},
    )
    data = data.dropna(subset=['msg'])

    # Shuffle the rows; the fixed seed keeps the order reproducible.
    data = data.sample(frac=1, random_state=42)
    print(data.shape)
    print(data.head(10))

    # Put a space between every character, the same preprocessing applied
    # to the training texts.
    data['msg'] = data['msg'].apply(' '.join)
    x_test = data['msg']

    # Load the fitted vectorizer. Unpickling can execute arbitrary code, so
    # only load a file produced by this project's own training step.
    with open('tfidf_model.pkl', 'rb') as f:
        tfidf_model = pickle.load(f)
    print(x_test[:10])
    tfidf_test = tfidf_model.transform(x_test)

    # Load the classifier and take the probability of class 1 per message.
    lr_model = joblib.load(filename="lr_word_ngram.pkl")
    y_pred_word = lr_model.predict_proba(tfidf_test)[:, 1]
    predict_df = pd.DataFrame({'y_pred_word': y_pred_word, 'x_test': x_test})

    # Write the scores next to the processed texts.
    predict_df.to_csv('predict_test.csv', index=False, sep="\t")

主函数:

if __name__ == '__main__':
    # Step 1: load the labelled data and split it into train/test parts.
    splits = load_data()
    # Step 2: train and save the models; the tuple unpacks in the order
    # (x_train, x_test, y_train, y_test).
    train_model(*splits)
    # Step 3: score the unlabelled messages with the saved models.
    predcit_main()

你可能感兴趣的:(机器学习算法,tf-idf,逻辑回归,算法)