13-垃圾邮件分类2

1. 读取数据

# Load the tab-separated SMS corpus: column 0 is the label, column 1 the
# message text.  NOTE: `preprocessing` must already be defined when this
# section runs.
file_path = r'C:\Users\jiqixuexi\SMSSpamCollection'
sms_data = []
sms_label = []
# `with` guarantees the file is closed even if a row fails to process;
# newline='' is what the csv module expects for files it parses itself.
with open(file_path, 'r', encoding='utf-8', newline='') as sms:
    # QUOTE_NONE: messages may contain literal double quotes; with the default
    # quoting a message that starts with '"' would swallow the following lines
    # into a single record.
    csv_reader = csv.reader(sms, delimiter='\t', quoting=csv.QUOTE_NONE)
    for line in csv_reader:
        if len(line) < 2:
            # Blank or malformed row: no label/text pair to record.
            continue
        sms_label.append(line[0])
        sms_data.append(preprocessing(line[1]))  # preprocess every message
 
2. 数据预处理
def get_wordnet_pos(treebank_tag):
    """Map a POS tag to a WordNet part-of-speech constant.

    The tag is matched by its first letter: 'J' -> adjective, 'V' -> verb,
    'R' -> adverb.  Tags starting with 'N' and every unrecognised tag map
    to noun.
    """
    wn = nltk.corpus.wordnet
    prefix_to_pos = (
        ('J', wn.ADJ),   # adjective
        ('V', wn.VERB),  # verb
        ('R', wn.ADV),   # adverb
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Nouns ('N...') and anything unrecognised are treated as nouns.
    return wn.NOUN

def preprocessing(text):
    """Normalise one message into a space-separated string of lemmas.

    Steps: tokenize, lower-case, drop English stop words and tokens shorter
    than three characters, POS-tag, then lemmatize each token with its
    WordNet part of speech.

    Args:
        text: raw message text.

    Returns:
        The remaining lemmas joined by single spaces ('' if none remain).
    """
    # Tokenize sentence by sentence, then word by word.
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # Lower-case BEFORE stop-word filtering so capitalised stop words
    # ("The", "And") are removed too.
    tokens = [token.lower() for token in tokens]
    # A set gives O(1) membership tests instead of scanning a list per token.
    stops = set(stopwords.words('english'))
    tokens = [token for token in tokens if len(token) >= 3 and token not in stops]
    tagged = nltk.pos_tag(tokens)  # list of (token, tag) pairs
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag)) for token, tag in tagged]
    # Join with a space: without a separator all words fuse into one string
    # and can no longer be split back into words downstream.
    return ' '.join(lemmas)
 
3.数据划分—训练集和测试集数据划分
# Hold out 20% of the messages for testing; stratify keeps the label
# proportions the same in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    sms_data, sms_label, test_size=0.2, stratify=sms_label
)
print('总集:', len(sms_label))
print('训练集:', len(x_train))
print('测试集:', len(y_test))  # closing parenthesis was missing (SyntaxError)
 
 

你可能感兴趣的:(13-垃圾邮件分类2)