NLP (1): Text Classification

Preface

# Convert to a Tensor data type
tf.convert_to_tensor(my_np_array, dtype=tf.float32)   # TensorFlow
torch.FloatTensor(py_list)                             # PyTorch
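A minimal, self-contained illustration of the two conversions (the array and list names here are made up for the example):

import numpy as np
import tensorflow as tf
import torch

# hypothetical toy inputs, only to show the two conversions
my_np_array = np.array([[1.0, 2.0], [3.0, 4.0]])
py_list = [1.0, 2.0, 3.0]

tf_tensor = tf.convert_to_tensor(my_np_array, dtype=tf.float32)   # TensorFlow float32 tensor
pt_tensor = torch.FloatTensor(py_list)                            # PyTorch float32 tensor
print(tf_tensor.shape, pt_tensor.shape)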
import pandas as pd

def file_process(file_path):
    # each line is expected to look like "<label> <text>"
    labels = []
    texts = []
    with open(file_path, 'r', encoding='utf_8') as f:
        for line in f:
            # split only on the first space so the text itself may contain spaces
            label, text = line.strip().split(' ', 1)
            labels.append(label)
            texts.append(text)
    return labels, texts

def describe_file(file_path):
    labels, texts = file_process(file_path)
    file_df = pd.DataFrame({'label': labels, 'text': texts})
    file_df['text_len'] = file_df['text'].apply(len)   # character length of each text
    return file_df

test_df = describe_file(path)   # path: location of the labelled text file
print(test_df)
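The parsing above assumes each line of the input file holds a label, a single space, and then the text. A small sketch with a hypothetical sample file (sample.txt and its contents are invented for illustration):

# write a tiny file in the assumed "<label> <text>" format (labels made up here)
sample = "0 this movie was terrible\n1 great acting and a touching story\n"
with open('sample.txt', 'w', encoding='utf_8') as f:
    f.write(sample)

# prints a two-row DataFrame with 'label', 'text' and 'text_len' columns
print(describe_file('sample.txt'))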

#encoding part
!pip install transformers
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained("bert-base-uncased")


test_text = test_df['text'].tolist()
# the tokenizer accepts either a single str or a list of str
encoded_input = tokenizer(test_text, padding=True, return_tensors='tf')
output = model(encoded_input)
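For BERT, the tokenizer returns input_ids, token_type_ids and attention_mask, and the model output exposes last_hidden_state plus pooler_output; a quick way to inspect them:

# what the tokenizer and the model actually return (for BERT)
print(encoded_input.keys())                 # input_ids, token_type_ids, attention_mask
print(encoded_input['input_ids'].shape)     # (num_sentences, max_seq_len)
print(output['last_hidden_state'].shape)    # (num_sentences, max_seq_len, 768)
print(output['pooler_output'].shape)        # (num_sentences, 768), one vector per sentence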

import tensorflow as tf
# BERT's pooled output is the input feature matrix; labels are integer class ids
x_train = output['pooler_output']                        # shape: (num_samples, 768)
y_train = [int(label) for label in test_df['label']]
y_train = tf.convert_to_tensor(y_train, dtype=tf.int32)
# y_train = torch.LongTensor(y_train)                    # PyTorch equivalent for integer labels
print(x_train.shape, len(y_train))
print(type(x_train), type(y_train))

#model part
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

# number of classes, assuming integer labels 0..K-1
num_classes = int(tf.reduce_max(y_train)) + 1

def build_classifier_model():
    # small classification head on top of the 768-dim BERT pooled output
    x_input = Input(shape=(768,))
    x_hidden = Dense(4, activation='relu')(x_input)
    # one softmax unit per class (a single-unit softmax would always output 1)
    x_out = Dense(num_classes, activation='softmax')(x_hidden)
    return Model(x_input, x_out)

classifier_model = build_classifier_model()
classifier_model.summary()
# integer labels, so use sparse_categorical_crossentropy
classifier_model.compile(loss='sparse_categorical_crossentropy',
                         optimizer=Adam(),
                         metrics=['accuracy'])
classifier_model.fit(x_train, y_train, epochs=2)
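As a usage sketch (the new sentences below are invented for illustration), the same tokenizer and BERT model can featurize unseen text before the trained head predicts a class:

# hypothetical new sentences for inference
new_texts = ["this is a sample sentence", "another example to classify"]
new_encoded = tokenizer(new_texts, padding=True, return_tensors='tf')
new_features = model(new_encoded)['pooler_output']   # (2, 768) pooled vectors
probs = classifier_model.predict(new_features)       # (2, num_classes) class probabilities
pred_ids = probs.argmax(axis=-1)                     # predicted integer label per sentence
print(pred_ids)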
