Text Mining Pipeline Example

[Figure: text mining pipeline overview]

import numpy as np
import pandas as pd
from contextlib import contextmanager
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import time
import re
import string
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import gc
from collections import defaultdict
import os
import psutil


# Report the resident memory usage (in GB) of the current process
def cpuStats(disp=""):
    """ @author: RDizzl3 @address: https://www.kaggle.com/rdizzl3"""
    pid = os.getpid()
    py = psutil.Process(pid)
    memoryUse = py.memory_info()[0] / 2. ** 30
    print("%s MEMORY USAGE for PID %10d : %.3f" % (disp, pid, memoryUse))

# Context manager that times a code block and prints the elapsed seconds
@contextmanager
def timer(name):
    """ Taken from Konstantin """
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

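# Usage sketch (illustrative, not part of the original script): each major stage
# below is wrapped like this and reports its elapsed time, e.g.
# with timer("Reading input files"):
#     train = pd.read_csv('../input/train.csv')
# # -> prints "[Reading input files] done in ... s"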

# Regex patterns to expand contractions, e.g. replace "won't" with "will not"
# (note: the text is lowercased before these run, so the uppercase US/IT patterns never fire)
cont_patterns = [
        (b'US', b'United States'),
        (b'IT', b'Information Technology'),
#         (b"[^a-zA-Z]u[^a-zA-Z]", b"you"),
#         (b"[^a-zA-Z]r[^a-zA-Z]", b"are"),
#         (b"[^a-zA-Z]y[^a-zA-Z]", b"why"),
#         (b"[^a-zA-Z]b4[^a-zA-Z]", b"before"),
        (b'(W|w)on\'t', b'will not'),
        (b'(C|c)an\'t', b'can not'),
        (b'(I|i)\'m', b'i am'),
        (b'(A|a)in\'t', b'is not'),
        (b'(\w+)\'ll', b'\g<1> will'),
        (b'(\w+)n\'t', b'\g<1> not'),
        (b'(\w+)\'ve', b'\g<1> have'),
        (b'(\w+)\'s', b'\g<1> is'),
        (b'(\w+)\'re', b'\g<1> are'),
        (b'(\w+)\'d', b'\g<1> would'),
    ]
patterns = [(re.compile(regex), repl) for (regex, repl) in cont_patterns]


# Text cleaning: prepare comments for char n-gram features
def prepare_for_char_n_gram(text):

    # 1. lowercase the text and work on bytes
    clean = bytes(text.lower(), encoding="utf-8")

    # 2. replace line breaks, tabs and other control characters with spaces
    clean = clean.replace(b"\n", b" ")
    clean = clean.replace(b"\t", b" ")
    clean = clean.replace(b"\b", b" ")
    clean = clean.replace(b"\r", b" ")

    # 3. expand contractions using the compiled regex patterns
    for (pattern, repl) in patterns:
        clean = re.sub(pattern, repl, clean)

    # 4. remove punctuation
    # clean = re.sub(b'[^\w^\s]+', b' ', clean)
    exclude = re.compile(b'[%s]' % re.escape(bytes(string.punctuation, encoding='utf-8')))
    clean = b" ".join([exclude.sub(b'', token) for token in clean.split()])

    # 5. remove numbers
    clean = re.sub(rb"\d+", b" ", clean)

    # 6. collapse repeated whitespace
    clean = re.sub(rb'\s+', b' ', clean)

    # 7. mark word boundaries with # signs,
    # e.g. "my name is bond" becomes "my# #name# #is# #bond"
    # (only interior boundaries are marked; the first and last words keep a bare edge)
    # clean = re.sub(b"([a-z]+)", b"#\g<1>#", clean)
    clean = re.sub(b" ", b"# #", clean)

    # 8. remove a trailing space if any
    clean = re.sub(rb'\s+$', b'', clean)

    return str(clean, 'utf-8')

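# Illustrative example (not part of the original script), assuming the patterns above:
# >>> prepare_for_char_n_gram("I can't believe it's TRUE!")
# 'i# #can# #not# #believe# #it# #is# #true'
# (contractions expanded, case and punctuation removed, interior word boundaries marked with #)
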
# Count the occurrences of a regex pattern in a text
def count_regexp_occ(regexp="", text=None):
    return len(re.findall(regexp, text))

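# Note: this returns a count rather than a boolean, so even the "has_*" features
# below hold occurrence counts. Illustrative example:
# >>> count_regexp_occ(r"[A-Z]", "Hello World")
# 2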

# Extract hand-crafted features and clean the comments
def perform_nlp(df):
    # Check all sorts of content that may help identify toxic comments

    # number of line breaks (\n)
    df["ant_slash_n"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\n", x))

    # number of words
    df["raw_word_len"] = df["comment_text"].apply(lambda x: len(x.split()))

    # number of characters
    df["raw_char_len"] = df["comment_text"].apply(lambda x: len(x))

    # number of uppercase letters
    df["nb_upper"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[A-Z]", x))

    # Check for IP addresses (the regex is kept for reference; the feature itself is left commented out)
    ip_regexp = r" ( ([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]) \.) {3} ([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]) $"
    # df["has_ip_address"] = df["comment_text"].apply(lambda x: count_regexp_occ(ip_regexp, x))
    # Remove IP address

    # number of f**k-like patterns (note: f..k also matches words such as folk, fork)
    df["nb_fk"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[Ff]\S{2}[Kk]", x))

    # number of suck-like patterns and other potentially offensive or targeting tokens
    df["nb_sk"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[Ss]\S{2}[Kk]", x))
    df["nb_dk"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[dD]ick", x))
    df["nb_you"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\W[Yy]ou\W", x))
    df["nb_mother"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\Wmother\W", x))
    df["nb_nigger"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\Wnigger\W", x))

    # start with :
    df["start_with_columns"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"^\:+", x))

    # check for timestamps; as written, \d{2}|:\d{2} also matches any pair of digits
    df["has_timestamp"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\d{2}|:\d{2}", x))
    # Remove timestamp

    # check for long dates, e.g. "18:44, 8 December 2010"
    df["has_date_long"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}", x))
    # Remove dates

    # check for short dates, e.g. "8 December 2010"
    df["has_date_short"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\D\d{1,2} \w+ \d{4}", x))

    # check for http(s) links
    df["has_http"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"http[s]{0,1}://\S+", x))
    # Remove http links

    # check for email addresses
    df["has_mail"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\S+\@\w+\.\w+", x))
    # remove mail

    # number of emphasis markers (==text== and text wrapped in quadruple quotes)
    df["has_emphasize_equal"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\={2}.+\={2}", x))
    df["has_emphasize_quotes"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\"{4}\S+\"{4}", x))

    # clean the raw comment text with prepare_for_char_n_gram
    df["clean_comment"] = df["comment_text"].apply(lambda x: prepare_for_char_n_gram(x))

    # number of words after cleaning
    df["clean_word_len"] = df["clean_comment"].apply(lambda x: len(x.split()))

    # number of characters after cleaning
    df["clean_char_len"] = df["clean_comment"].apply(lambda x: len(x))

    # number of distinct characters and their ratio to the (capped) comment length
    df["clean_chars"] = df["clean_comment"].apply(lambda x: len(set(x)))
    df["clean_chars_ratio"] = df["clean_comment"].apply(lambda x: len(set(x))) / df["clean_comment"].apply(lambda x: 1 + min(99, len(x)))


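# Usage sketch (illustrative, not in the original script): perform_nlp mutates the
# DataFrame in place, so the engineered columns can be inspected right after the call:
# >>> sample = pd.DataFrame({"comment_text": ["You SUCK!\nsee http://example.com"]})
# >>> perform_nlp(sample)
# >>> sample[["raw_word_len", "nb_upper", "nb_sk", "has_http", "clean_comment"]]
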
# Tokenizer that splits each whitespace token into overlapping 3-character grams
def char_analyzer(text):
    tokens = text.split()
    return [token[i: i + 3] for token in tokens for i in range(len(token) - 2)]

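# Illustrative example (assuming the #-wrapped tokens produced by prepare_for_char_n_gram):
# >>> char_analyzer("i# #can# #not#")
# ['#ca', 'can', 'an#', '#no', 'not', 'ot#']
# ("i#" is shorter than 3 characters, so it yields no grams)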

if __name__ == '__main__':

    # enable automatic garbage collection
    gc.enable()

    # label
    class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # read data
    with timer("Reading input files"):
        train = pd.read_csv('../input/train.csv').fillna(' ')
        test = pd.read_csv('../input/test.csv').fillna(' ')
    # report memory usage
    cpuStats()

    # clean data
    with timer("Performing basic NLP"):
        perform_nlp(train)
        perform_nlp(test)
        # train["ant_slash_n"] = train["comment_text"].apply(lambda x: count_regexp_occ(r"\n", x))
        # test["ant_slash_n"] = test["comment_text"].apply(lambda x: count_regexp_occ(r"\n", x))
        # train['clean_comment'] = train["comment_text"]
        # test['clean_comment'] = test["comment_text"]
    # report memory usage
    cpuStats()

    # cleaned comments
    train_text = train['clean_comment']
    test_text = test['clean_comment']
    all_text = pd.concat([train_text, test_text])

    # min-max scale the numerical features and store them as sparse matrices
    with timer("Creating numerical features"):
        num_features = [f_ for f_ in train.columns
                        if f_ not in ["comment_text", "clean_comment", "id", "remaining_chars", 'has_ip_address'] + class_names]

        skl = MinMaxScaler()
        train_num_features = csr_matrix(skl.fit_transform(train[num_features]))
        # reuse the scaler fitted on train so train and test share the same scaling
        test_num_features = csr_matrix(skl.transform(test[num_features]))
    cpuStats()

    # word-level TF-IDF features (unigrams and bigrams)
    with timer("Tfidf on word"):
        word_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='word',
            token_pattern=r'\w{1,}',
            stop_words='english',
            ngram_range=(1, 2),
            max_features=20000)
        word_vectorizer.fit(all_text)
        train_word_features = word_vectorizer.transform(train_text)
        test_word_features = word_vectorizer.transform(test_text)
    del word_vectorizer
    # trigger garbage collection
    gc.collect()
    cpuStats()

    # char-level TF-IDF: char_analyzer yields 3-char tokens that the 'word' analyzer then weights
    with timer("Tfidf on char n_gram"):
        char_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            tokenizer=char_analyzer,
            analyzer='word',
            ngram_range=(1, 1),
            max_features=50000)
        char_vectorizer.fit(all_text)
        train_char_features = char_vectorizer.transform(train_text)
        test_char_features = char_vectorizer.transform(test_text)

    # with timer("Tfidf on char n_gram"):
    #     char_vectorizer = TfidfVectorizer(
    #         sublinear_tf=True,
    #         strip_accents='unicode',
    #         analyzer='char',
    #         stop_words='english',
    #         ngram_range=(1, 5),
    #         max_features=20000)
    #     char_vectorizer.fit(all_text)
    #     train_char_features = char_vectorizer.transform(train_text)
    #     test_char_features = char_vectorizer.transform(test_text)
    del char_vectorizer
    gc.collect()

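    # sanity check: max number of distinct char 3-gram features active in a single comment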
    print((train_char_features>0).sum(axis=1).max())

    del train_text
    del test_text
    gc.collect()
    cpuStats()


    # stack the three feature blocks (char, word, numerical) for train and test
    with timer("Stacking matrices"):
        # train features
        csr_trn = hstack(
            [
                train_char_features, 
                train_word_features, 
                train_num_features
            ]
        ).tocsr()


        # del train_word_features
        del train_num_features
        del train_char_features
        gc.collect()

        # test features
        csr_sub = hstack(
            [
                test_char_features, 
                test_word_features, 
                test_num_features
            ]
        ).tocsr()

        # del test_word_features
        del test_num_features
        del test_char_features
        gc.collect()

    # keep the test ids for the submission file
    submission = pd.DataFrame.from_dict({'id': test['id']})
    del test
    gc.collect()

    # keep only the id and target columns in train
    drop_f = [f_ for f_ in train if f_ not in ["id"] + class_names]
    train.drop(drop_f, axis=1, inplace=True)
    gc.collect()
    cpuStats()

    # K-fold LightGBM training with out-of-fold scoring
    with timer("Scoring LightGBM"):

        # KFold dataset
        scores = []
        folds = KFold(n_splits=4, shuffle=True, random_state=1)
        lgb_round_dict = defaultdict(int)
        # build the LightGBM dataset once (free_raw_data=False so folds can be subset)
        trn_lgbset = lgb.Dataset(csr_trn, free_raw_data=False) 

        cpuStats("LGB Dataset created")
        del csr_trn
        gc.collect()

        # train one binary LightGBM model per label
        cpuStats("Training csr matrix freed")
        for class_name in class_names:
            print("Class %s scores : " % class_name)
            class_pred = np.zeros(len(train))
            # set label
            train_target = train[class_name]
            trn_lgbset.set_label(train_target.values)
            params = {
                "objective": "binary",
                'metric': {'auc'},
                "boosting_type": "gbdt",
                "verbosity": -1,
                "num_threads": 4,
                "bagging_fraction": 0.8,
                "feature_fraction": 0.8,
                "learning_rate": 0.1,
                "num_leaves": 31,
                "verbose": -1,
                "min_split_gain": .1,
                "reg_alpha": .1
            }
            lgb_rounds = 500

            for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, train_target)):

                watchlist = [
                    trn_lgbset.subset(trn_idx), 
                    trn_lgbset.subset(val_idx)
                ]
                # train LightGBM on this fold
                model = lgb.train(
                    params=params,
                    train_set=watchlist[0],
                    num_boost_round=lgb_rounds,
                    valid_sets=watchlist,
                    early_stopping_rounds=50,
                    verbose_eval=0
                )
                class_pred[val_idx] = model.predict(trn_lgbset.data[val_idx], num_iteration=model.best_iteration)
                score = roc_auc_score(train_target.values[val_idx], class_pred[val_idx])
                lgb_round_dict[class_name] += model.best_iteration
                print("\t Fold %d : %.6f in %3d rounds" % (n_fold + 1, score, model.best_iteration))
                cpuStats("End of fold %d" % (n_fold + 1))
            print("full score : %.6f" % roc_auc_score(train_target, class_pred))
            scores.append(roc_auc_score(train_target, class_pred))
            train[class_name + "_oof"] = class_pred

        # Save OOF predictions
        train[["id"] + class_names + [f + "_oof" for f in class_names]].to_csv("lvl0_lgbm_clean_oof.csv",
                                                                    index=False,
                                                                    float_format="%.8f")

        print('Total CV score is {}'.format(np.mean(scores)))

    # retrain on the full training data and predict test probabilities
    with timer("Predicting probabilities"):
        params = {
                "objective": "binary",
                'metric': {'auc'},
                "boosting_type": "gbdt",
                "verbosity": -1,
                "num_threads": 4,
                "bagging_fraction": 0.8,
                "feature_fraction": 0.8,
                "learning_rate": 0.1, 
                "num_leaves": 31,
                "verbose": -1,
                "min_split_gain": .1,
                "reg_alpha": .1
            }

        for class_name in class_names:
            with timer("Predicting probabilities for %s" % class_name):
                train_target = train[class_name]
                trn_lgbset.set_label(train_target.values)
                # Train lgb
                model = lgb.train(
                    params=params,
                    train_set=trn_lgbset,
                    num_boost_round=int(lgb_round_dict[class_name] / folds.n_splits)
                )
                # predict
                submission[class_name] = model.predict(csr_sub, num_iteration=model.best_iteration)

    # write the submission file
    submission.to_csv("lvl0_lgbm_clean_sub.csv", index=False, float_format="%.8f")
