Notes based on the following article.
Original link: https://blog.insightdatascience.com/how-to-solve-90-of-nlp-problems-a-step-by-step-guide-fda605278e4e
Note: look at the labels first; they are what the pipeline predicts.
Principle: even the best model cannot rescue garbage data.
import pandas as pd
pd.set_option("display.max_columns", None, "display.max_colwidth", 200)
df = pd.read_csv("")  # dataset path left blank in the original notes
For exploration, mainly use groupby and pivot_table.
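For example, a quick first look at class balance and per-class text length; the column names "label" and "text" are assumptions here, not from the original notes:
df["length"] = df["text"].str.len()                    # characters per document
print(df["label"].value_counts())                      # class balance check
print(df.groupby("label")["length"].describe())        # length distribution per class
print(df.pivot_table(index="label", values="length", aggfunc="mean"))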
df["revivew"] = df.review.apply(lambda x:BeautifulSoup(x,"html.parser).get_text())
df["text"].str.replace(r"[^A-Za-z0-9,.!'?]"," ")
df["text"].str.replace(r"@","at")
df["text"].str.lower()
df["text"]=df["text"].str.replace(r"i'm","i am")
df["token"] = df.review.apply(nltk.word_tokenize)
def stemmer(text):
    porter = nltk.PorterStemmer()
    return [porter.stem(w) for w in text]
df["token"] = df.token.apply(stemmer)
def lemmer(text):
    # stemming and lemmatization are normally alternatives; the original notes apply both
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]
df["token"] = df.token.apply(lemmer)
from nltk.corpus import stopwords
stop = set(stopwords.words("english"))
def remove(text):
    return " ".join(w for w in text if w not in stop)  # drop stopwords, rejoin into a string
df["token"] = df.token.apply(remove)
df.to_excel("a.xlsx")
df.to_csv("a.csv")
list_corpus = df["text"].tolist()
list_label = df["label"].tolist()
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(list_corpus,list_label,test_size=0.2,random_state=1)
from sklearn.feature_extraction.text import CountVectorizer
def cv(text):
    counter = CountVectorizer()
    emb = counter.fit_transform(text)
    return emb, counter
x_train, counter = cv(x_train)
x_test = counter.transform(x_test)  # reuse the fitted vocabulary on the test set
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf(text):
    vectorizer = TfidfVectorizer()
    emb = vectorizer.fit_transform(text)
    return emb, vectorizer
# TF-IDF is an alternative to raw counts: re-run the split first so x_train/x_test hold raw text again
x_train, tfidf_vec = tfidf(x_train)
x_test = tfidf_vec.transform(x_test)
import gensim
import numpy as np
from gensim.models import Word2Vec
# Word2Vec needs tokenized sentences, e.g. [["i","love","you"], ["do","you","love","me"]],
# so rebuild token lists from the cleaned, space-joined strings above
sentences = df["token"].str.split().tolist()
model = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=5, sample=1e-3, sg=1)  # gensim >= 4 uses vector_size; older versions call it size
model.save("bag")
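As a quick sanity check on the trained vectors; the query word "good" is only an example and must survive min_count:
print(model.wv.most_similar("good", topn=5))  # nearest neighbours in embedding space
print(model.wv["good"].shape)                 # a 300-dimensional vector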
Turn each sentence into a single vector by averaging its word vectors, then redo the train/test split.
word2vector = Word2Vec.load("bag")
def average(text, size=300):
    if len(text) < 1:
        return np.zeros(size)
    # gensim >= 4 looks vectors up through .wv; unknown words fall back to zeros
    a = [word2vector.wv[w] if w in word2vector.wv else np.zeros(size) for w in text]
    length = len(a)
    summed = np.sum(a, axis=0)
    return np.divide(summed, length)
df["text"] = df["text"].apply(average) 注意此处的df["text"] 未分词
list_corpus = df["text"].tolist()
list_label = df["label"].tolist()
x_train, x_test, y_train, y_test = train_test_split(list_corpus, list_label, test_size=0.2, random_state=1)
Tune hyperparameters at this stage. C, the inverse of the regularization strength, has a large effect on the results, and different penalty types require different solvers (see the grid-search sketch after the training code below).
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty="l2", C=1.0, class_weight="balanced", n_jobs=-1, random_state=1, solver="newton-cg")
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)
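A minimal grid-search sketch for the tuning mentioned above; the C grid and the scoring choice are assumptions, not values from the article:
from sklearn.model_selection import GridSearchCV
param_grid = {"C": [0.01, 0.1, 1.0, 10.0]}  # hypothetical grid around the default
search = GridSearchCV(
    LogisticRegression(penalty="l2", class_weight="balanced", solver="newton-cg", random_state=1),
    param_grid, scoring="f1_weighted", cv=5)
search.fit(x_train, y_train)
print(search.best_params_, search.best_score_)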
from sklearn.metrics import precision_score,accuracy_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix
# with average="weighted", pos_label is ignored, so it is dropped here
precision = precision_score(y_test, y_predict, average="weighted")
accuracy = accuracy_score(y_test, y_predict)
recall = recall_score(y_test, y_predict, average="weighted")
f1 = f1_score(y_test, y_predict, average="weighted")
cm = confusion_matrix(y_test,y_predict)
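To eyeball where the classifier confuses classes, a minimal matplotlib sketch:
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
im = ax.imshow(cm, cmap="Blues")  # rows = true labels, columns = predicted labels
ax.set_xlabel("predicted label")
ax.set_ylabel("true label")
fig.colorbar(im)
plt.show()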
TF-IDF follow-ups: pull out the keywords that drive each class and visualize the embeddings (see the keyword sketch below).
Pretrained embeddings to explore next: word2vec, GloVe, CoVe.
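A sketch of the TF-IDF keyword inspection, assuming the tfidf_vec/clf pair from the TF-IDF pipeline above and a binary label; the variable names are mine:
feature_names = np.array(tfidf_vec.get_feature_names_out())  # sklearn >= 1.0; older versions use get_feature_names()
weights = clf.coef_[0]                                       # one weight per vocabulary term (binary case)
print(feature_names[np.argsort(weights)[-10:]])              # words pushing toward the positive class
print(feature_names[np.argsort(weights)[:10]])               # words pushing toward the negative class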