Word2Vec

Text-processing pipeline:
preprocess: tokenize + lemmatization/stemming + stopword removal → word list
+ make features
+ ML

tokenize

English

import nltk
sent = ""
tokens = nltk.word_tokenize(sent)
tokens
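A runnable version with a sample sentence of my own (requires nltk.download('punkt')):

import nltk

sent = "Hello Mr. Smith, how are you doing today?"
print(nltk.word_tokenize(sent))
# ['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?']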

Chinese

import jieba
seg_list = jieba.cut("", cut_all=True)    # full mode
seg_list = jieba.cut("", cut_all=False)   # precise mode (the default)
seg_list = jieba.cut_for_search("")       # search-engine mode
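A runnable example with a sample sentence (the sentence is mine, for illustration):

import jieba

sent = "我来到北京清华大学"
print("Full mode:    " + "/".join(jieba.cut(sent, cut_all=True)))
print("Precise mode: " + "/".join(jieba.cut(sent, cut_all=False)))
print("Search mode:  " + "/".join(jieba.cut_for_search(sent)))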

For messy social-media text, use regular expressions (re)

import re
emoticons_str = r""
regex_str = [emoticons_str, r"",r"",r""]
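One possible way to fill in these patterns (the specific regular expressions below are illustrative, not from the original notes):

import re

emoticons_str = r"""
    (?:
        [:=;]                      # eyes
        [oO\-]?                    # optional nose
        [D\)\]\(\]/\\OpP]          # mouth
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>',                                                   # HTML tags
    r'(?:@[\w_]+)',                                               # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",                             # hashtags
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',  # URLs
    r"(?:[a-z][a-z'\-_]+[a-z])",                                  # words with ' and -
    r'(?:[\w_]+)',                                                # other words
    r'(?:\S)'                                                     # anything else
]
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

print(tokenize("RT @somebody: this is cool :-) http://example.com #nlp"))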

Word-form normalization

stemming: strip affixes and keep only the word stem
lemmatization: map the different inflected forms of a word back to one base form (lemma)

from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem("")

from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
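A small comparison of the three stemmers on a few words of my own choosing:

from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer

words = ["maximum", "presumably", "provision", "owed"]
for stemmer in (PorterStemmer(), LancasterStemmer(), SnowballStemmer("english")):
    print(type(stemmer).__name__, [stemmer.stem(w) for w in words])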

Lemmatization with NLTK

Normalizes a word according to its part of speech

from nltk.stem import WordNetLemmatizer
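A short example of my own showing why the POS tag matters (requires nltk.download('wordnet')):

wordnet_lemmatizer = WordNetLemmatizer()
print(wordnet_lemmatizer.lemmatize("churches"))       # church
print(wordnet_lemmatizer.lemmatize("went"))           # went (treated as a noun by default)
print(wordnet_lemmatizer.lemmatize("went", pos="v"))  # go (correct once the verb POS is given)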

POS tagging with NLTK

Gives each token its part-of-speech tag
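A minimal example of my own (requires nltk.download('averaged_perceptron_tagger')); the resulting tags can be mapped to WordNet POS tags and passed to the lemmatizer above:

import nltk

words = nltk.word_tokenize("what does the fox say")
print(nltk.pos_tag(words))
# e.g. [('what', 'WDT'), ('does', 'VBZ'), ('the', 'DT'), ('fox', 'NNS'), ('say', 'VB')]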

stopwords

Pronouns and similar function words are ambiguous out of context, so they are usually filtered out
Stop-word list: www.ranks.nl/stopwords

from nltk.corpus import stopwords
# tokenize first to get word_list
# then filter out the stop words
filtered_words = [word for word in word_list if word not in stopwords.words('english')]

Application: sentiment analysis

Simple approach: use a ready-made English word-score list such as AFINN-111

sentiment_dictionary = {}
for line in open(''):  # path to the AFINN-111 file
	word, score = line.split('\t')
	sentiment_dictionary[word] = int(score)
# words: the tokenized sentence to score
total_score = sum(sentiment_dictionary.get(word, 0) for word in words)

Sentiment analysis with ML
A Naive Bayes classifier is a simple choice

from nltk.classify import NaiveBayesClassifier
s1 = ""
s2 = ""
s3 = ""
s4 = ""
def preprocess(s):
    # bag-of-words features: {word: True}
    return {word: True for word in s.lower().split()}
training_data = [(preprocess(s1), 'pos'),
                 (preprocess(s2), 'pos'),
                 (preprocess(s3), 'neg'),   # a binary classifier needs examples of both classes
                 (preprocess(s4), 'neg'),
                ]
model = NaiveBayesClassifier.train(training_data)
print(model.classify(preprocess('')))

Application: text similarity

Represent a text by word-frequency features

import nltk
from nltk import FreqDist

corpus = ""
tokens = nltk.word_tokenize(corpus)
print(tokens)

fdist = FreqDist(tokens) # a dict-like object mapping each word to its count

print(fdist[""])

# Take the 50 most common words. The counts themselves are only used to pick these words;
# what we keep is the vocabulary and its ordering, which defines a standard vector.
# A new sentence is then encoded by counting, at each position, how often that word occurs (a sparse vector).
standard_freq_vector = fdist.most_common(50)
size = len(standard_freq_vector) # the 50 most common words together with their counts
print(standard_freq_vector)
# Record the position of each word, ordered by frequency
def position_lookup(v): # returns a dict mapping each word to its position
    res = {}
    counter = 0
    for word in v:
        res[word[0]] = counter
        counter += 1
    return res
# Record the standard word positions: a dict of the common words and their positions
standard_position_dict = position_lookup(standard_freq_vector)
print(standard_position_dict) # the position lookup table
# A new input sentence
sentence = ""
freq_vector = [0] * size # build a vector of the same length
tokens = nltk.word_tokenize(sentence)
for word in tokens:
    try:
        freq_vector[standard_position_dict[word]] += 1 # look up the word's position and add 1 there to mark one occurrence
    except KeyError:
        # the word is not in the standard vocabulary, skip it
        continue
print(freq_vector)

Application: text classification

TF-IDF

TF (Term Frequency): how often a term appears in a document.
TF(t) = (number of times t appears in the document) / (total number of terms in the document)
IDF (Inverse Document Frequency): how informative a term is.
IDF(t) = log(total number of documents / number of documents containing t)
TF-IDF = TF * IDF
Example: a document has 100 words and "baby" appears 3 times, so TF(baby) = 3/100 = 0.03.
Suppose there are 10M documents and "baby" appears in 1,000 of them:
IDF(baby) = log10(10,000,000 / 1,000) = 4   (base-10 log here, so the numbers stay round)
TF-IDF(baby) = 0.03 * 4 = 0.12
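The same calculation as a small Python sketch (function and variable names are mine):

import math

def tf(term, doc_tokens):
    return doc_tokens.count(term) / len(doc_tokens)

def idf(term, docs):
    n_containing = sum(1 for doc in docs if term in doc)
    return math.log10(len(docs) / n_containing)   # base-10 log, matching the example above

def tf_idf(term, doc_tokens, docs):
    return tf(term, doc_tokens) * idf(term, docs)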

TF-IDF with NLTK

from nltk.text import TextCollection

corpus = TextCollection(["", "", ""]) # this class handles the splitting, counting and computation for you

print(corpus.tf_idf("", "")) # arguments: the term and the text it appears in
# To get a standardized vector (same length for every sentence):
new_sentence = ""
# standard_vocab: the standard vocabulary defined earlier
for word in standard_vocab: # compute each standard word's tf-idf in the new sentence; every sentence then yields a vector of the same length
    print(corpus.tf_idf(word, new_sentence))

Once every new sentence has been turned into a vector of the same length this way, machine learning can start.

Case study: keyword search

Search on an online-retail site: score how relevant each result is to the query.
train.csv holds product titles, search terms, and a human relevance score for each (query, product) pair.
test.csv gives the search terms; the task is to predict the relevance for each product and submit the result.

#关键词搜索
#Kaggle竞赛题:https://www.kaggle.com/c/home-depot-product-search-relevance

#鉴于课件里已经完整的show了NLTK在各个NLP处理上的用法,我这里就不再重复使用了。

#本篇的教程里会尽量用点不一样的库,让大家感受一下Python NLP领域各个库的优缺点。

#Step1:导入所需
#所有要用到的库

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor # 随机森林
from nltk.stem.snowball import SnowballStemmer # 预处理
#读入训练/测试集 

df_train = pd.read_csv('../input/train.csv', encoding="ISO-8859-1") # 注意编码方式
df_test = pd.read_csv('../input/test.csv', encoding="ISO-8859-1")
#这里还有个有用的玩意儿,叫产品介绍

df_desc = pd.read_csv('../input/product_descriptions.csv')
#看看数据们都长什么样子

df_train.head()
#id	product_uid	product_title	search_term	relevance
#0	2	100001	Simpson Strong-Tie 12-Gauge Angle	angle bracket	3.00
#1	3	100001	Simpson Strong-Tie 12-Gauge Angle	l bracket	2.50
#2	9	100002	BEHR Premium Textured DeckOver 1-gal. #SC-141 ...	deck over	3.00
#3	16	100005	Delta Vero 1-Handle Shower Only Faucet Trim Ki...	rain shower head	2.33
#4	17	100005	Delta Vero 1-Handle Shower Only Faucet Trim Ki...	shower only faucet	2.67
df_desc.head() # 对照表
#product_uid	product_description
#0	100001	Not only do angles make joints stronger, they ...
#1	100002	BEHR Premium Textured DECKOVER is an innovativ...
#2	100003	Classic architecture meets contemporary design...
#3	100004	The Grape Solar 265-Watt Polycrystalline PV So...
#4	100005	Update your bathroom with the Delta Vero Singl...
#看来不要做太多的复杂处理,我们于是直接合并测试/训练集,以便于统一做进一步的文本预处理

df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True) # stack train and test rows vertically (axis=0)
df_all.head()
#id	product_title	product_uid	relevance	search_term
#0	2	Simpson Strong-Tie 12-Gauge Angle	100001	3.00	angle bracket
#1	3	Simpson Strong-Tie 12-Gauge Angle	100001	2.50	l bracket
#2	9	BEHR Premium Textured DeckOver 1-gal. #SC-141 ...	100002	3.00	deck over
#3	16	Delta Vero 1-Handle Shower Only Faucet Trim Ki...	100005	2.33	rain shower head
#4	17	Delta Vero 1-Handle Shower Only Faucet Trim Ki...	100005	2.67	shower only faucet#
#合并之后我们得到:

df_all.shape
#(240760, 5)
#产品介绍也是一个极有用的信息,我们把它拿过来:

df_all = pd.merge(df_all, df_desc, how='left', on='product_uid') # 继续合并
df_all.head()
#id	product_title	product_uid	relevance	search_term	product_description
#0	2	Simpson Strong-Tie 12-Gauge Angle	100001	3.00	angle bracket	Not only do angles make joints stronger, they #...
#1	3	Simpson Strong-Tie 12-Gauge Angle	100001	2.50	l bracket	Not only do angles make joints stronger, they ...
#2	9	BEHR Premium Textured DeckOver 1-gal. #SC-141 ...	100002	3.00	deck over	BEHR Premium Textured DECKOVER is an innovativ...
#3	16	Delta Vero 1-Handle Shower Only Faucet Trim Ki...	100005	2.33	rain shower head	Update your bathroom with the Delta Vero Singl...
#4	17	Delta Vero 1-Handle Shower Only Faucet Trim Ki...	100005	2.67	shower only faucet	Update your bathroom with the Delta Vero Singl...
#好了,现在我们得到一个全体的数据大表格

#Step 2: 文本预处理
#我们这里遇到的文本预处理比较简单,因为最主要的就是看关键词是否会被包含。

#所以我们统一化我们的文本内容,以达到任何term在我们的数据集中只有一种表达式的效果。

#我们这里用简单的Stem做个例子:

#(有兴趣的同学可以选用各种你觉得靠谱的预处理方式:去掉停止词,纠正拼写,去掉数字,去掉各种emoji,等等)

stemmer = SnowballStemmer('english') # 英文的雪球处理

def str_stemmer(s): # 单词小写、分开做stem,再合并得到单词列表
    return " ".join([stemmer.stem(word) for word in s.lower().split()])
#为了计算『关键词』的有效性,我们可以naive地直接看『出现了多少次』(这里是简单统计str1在str2之中出现了多少次,更高级的就使用tf-idf)

def str_common_word(str1, str2):
    return sum(int(str2.find(word)>=0) for word in str1.split())
#接下来,把每一个column都跑一遍,以清洁所有的文本内容

df_all['search_term'] = df_all['search_term'].map(lambda x:str_stemmer(x)) #匿名函数,意思是将x(此列中的每个cell)中的每个词运行stem函数再返回
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stemmer(x))
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stemmer(x))
#Step 3: 自制文本特征
#一般属于一种脑洞大开的过程,想到什么可以加什么。

#当然,特征也不是越丰富越好,稍微靠谱点是肯定的。

#关键词的长度:
df_all['len_of_query'] = df_all['search_term'].map(lambda x:len(x.split())).astype(np.int64) 
#标题中有多少关键词重合
df_all['commons_in_title'] = df_all.apply(lambda x:str_common_word(x['search_term'],x['product_title']), axis=1)
#描述中有多少关键词重合
df_all['commons_in_desc'] = df_all.apply(lambda x:str_common_word(x['search_term'],x['product_description']), axis=1)
#等等等等。。变着法子想出些数字能代表的features,一股脑放进来~

#搞完之后,我们把不能被『机器学习模型』处理的column给drop掉(去掉文字,留下数字)

df_all = df_all.drop(['search_term','product_title','product_description'],axis=1)
#Step 4: 重塑训练/测试集
#舒淇说得好,要把之前脱下的衣服再一件件穿回来

#数据处理也是如此,搞完一圈预处理之后,我们让数据重回原本的样貌

#Split train and test back apart (they were concatenated for preprocessing).
#Note: df_all was re-indexed with ignore_index=True, so df_test.index no longer points
#at the test rows; slice by position instead.
num_train = df_train.shape[0]
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
#记录下测试集的id
#留着上传的时候 能对的上号,将id取出

test_ids = df_test['id']
#分离出y_train,相关度就是y
y_train = df_train['relevance'].values
#把原集中的label给删去
#否则就是cheating了

X_train = df_train.drop(['id','relevance'],axis=1).values # 将y去掉,axis=1代表以列去除的
X_test = df_test.drop(['id','relevance'],axis=1).values
#Step 5: 建立模型
#我们用个最简单的模型:随机森林模型

from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import cross_val_score # 5-fold CV: train on 4 folds, evaluate on the held-out fold, average the 5 scores
#use cross-validation to keep the evaluation honest, and tune max_depth

params = [1,3,5,6,7,8,9,10] # a hand-rolled grid search over max_depth
test_scores = []
for param in params:
    clf = RandomForestRegressor(n_estimators=30, max_depth=param) # max_depth is the hyper-parameter being tuned
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error')) 
    test_scores.append(np.mean(test_score)) # average RMSE over the 5 folds
#画个图来看看:

import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(params, test_scores)
plt.title("Param vs CV Error");

#大概6~7的时候达到了最优解

#Step 6: 上传结果
#用我们测试出的最优解建立模型,并跑跑测试集

rf = RandomForestRegressor(n_estimators=30, max_depth=6)
rf.fit(X_train, y_train)
#RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
#           max_features='auto', max_leaf_nodes=None,
#           min_impurity_split=1e-07, min_samples_leaf=1,
#           min_samples_split=2, min_weight_fraction_leaf=0.0,
#           n_estimators=30, n_jobs=1, oob_score=False, random_state=None,
#           verbose=0, warm_start=False)
y_pred = rf.predict(X_test)
#把拿到的结果,放进PD,做成CSV上传:

pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('submission.csv',index=False)
#总结:
#这一篇教程中,虽然都是用的最简单的方法,但是基本框架是很完整的。

#同学们可以尝试修改/调试/升级的部分是:

#文本预处理步骤: 你可以使用很多不同的方法来使得文本数据变得更加清洁

#Hand-crafted features: come up with more ways to express features (full-text keyword overlap counts, overlap ratios, etc.; a small example follows below)

#更好的回归模型: 根据之前的课讲的Ensemble方法,把分类器提升到极致
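As a concrete illustration of the overlap-ratio idea mentioned above, here is one possible extra feature (my own sketch, not from the original; it would belong in Step 3, before the text columns are dropped):

# fraction of query words that appear in the product title
def str_common_ratio(str1, str2):
    words = str1.split()
    if not words:
        return 0.0
    return sum(int(str2.find(w) >= 0) for w in words) / len(words)

# df_all['ratio_in_title'] = df_all.apply(
#     lambda x: str_common_ratio(x['search_term'], x['product_title']), axis=1)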

Case study: Bag of Words Meets Bags of Popcorn

https://www.kaggle.com/ymanojkumar023/kumarmanoj-bag-of-words-meets-bags-of-popcorn

#import所需库
import os
import re
import numpy as np
import pandas as pd
​
from bs4 import BeautifulSoup
​
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier # 随机森林
from sklearn.metrics import confusion_matrix # 混淆矩阵
import nltk
#nltk.download()
from nltk.corpus import stopwords

# 用pandas读入训练数据
datafile = os.path.join('..', 'data', 'labeledTrainData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
print('Number of reviews: {}'.format(len(df)))
df.head()
#Number of reviews: 25000
#id	sentiment	review
#0	5814_8	1	With all this stuff going down at the moment w...
#1	2381_9	1	"The Classic War of the Worlds" by Timothy Hin...
#2	7759_3	0	The film starts with a manager (Nicholas Bell)...
#3	3630_4	0	It must be assumed that those who praised this...
#4	9495_8	1	Superbly trashy and wondrously unpretentious 8...
#对影评数据做预处理,大概有以下环节:
#去掉html标签
#移除标点
#切分成词/token
#去掉停用词
#重组为新的句子
def display(text, title):
    print(title)
    print("\n----------我是分割线-------------\n")
    print(text) 
raw_example = df['review'][1]
display(raw_example, '原始数据')
'''
原始数据

----------我是分割线-------------

"The Classic War of the Worlds" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur "critics" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the "critics". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells' classic novel, and we found it to be very entertaining. This made it easy to overlook what the "critics" perceive to be its shortcomings.
'''
example = BeautifulSoup(raw_example, 'html.parser').get_text() # 使用bs解析文本
display(example, '去掉HTML标签的数据')
'''
去掉HTML标签的数据

----------我是分割线-------------

"The Classic War of the Worlds" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur "critics" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the "critics". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells' classic novel, and we found it to be very entertaining. This made it easy to overlook what the "critics" perceive to be its shortcomings.
'''
example_letters = re.sub(r'[^a-zA-Z]', ' ', example) # use a regex to replace everything except letters with a space
display(example_letters, '去掉标点的数据')
'''
去掉标点的数据

----------我是分割线-------------

 The Classic War of the Worlds  by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H  G  Wells  classic book  Mr  Hines succeeds in doing so  I  and those who watched his film with me  appreciated the fact that it was not the standard  predictable Hollywood fare that comes out every year  e g  the Spielberg version with Tom Cruise that had only the slightest resemblance to the book  Obviously  everyone looks for different things in a movie  Those who envision themselves as amateur  critics  look only to criticize everything they can  Others rate a movie on more important bases like being entertained  which is why most people never agree with the  critics   We enjoyed the effort Mr  Hines put into being faithful to H G  Wells  classic novel  and we found it to be very entertaining  This made it easy to overlook what the  critics  perceive to be its shortcomings 
 '''
words = example_letters.lower().split() # 做一个小写化,并用空格分成一个list
display(words, '纯词列表数据')
'''
纯词列表数据

----------我是分割线-------------

[u'the', u'classic', u'war', u'of', u'the', u'worlds', u'by', u'timothy', u'hines', u'is', u'a', u'very', u'entertaining', u'film', u'that', u'obviously', u'goes', u'to', u'great', u'effort', u'and', u'lengths', u'to', u'faithfully', u'recreate', u'h', u'g', u'wells', u'classic', u'book', u'mr', u'hines', u'succeeds', u'in', u'doing', u'so', u'i', u'and', u'those', u'who', u'watched', u'his', u'film', u'with', u'me', u'appreciated', u'the', u'fact', u'that', u'it', u'was', u'not', u'the', u'standard', u'predictable', u'hollywood', u'fare', u'that', u'comes', u'out', u'every', u'year', u'e', u'g', u'the', u'spielberg', u'version', u'with', u'tom', u'cruise', u'that', u'had', u'only', u'the', u'slightest', u'resemblance', u'to', u'the', u'book', u'obviously', u'everyone', u'looks', u'for', u'different', u'things', u'in', u'a', u'movie', u'those', u'who', u'envision', u'themselves', u'as', u'amateur', u'critics', u'look', u'only', u'to', u'criticize', u'everything', u'they', u'can', u'others', u'rate', u'a', u'movie', u'on', u'more', u'important', u'bases', u'like', u'being', u'entertained', u'which', u'is', u'why', u'most', u'people', u'never', u'agree', u'with', u'the', u'critics', u'we', u'enjoyed', u'the', u'effort', u'mr', u'hines', u'put', u'into', u'being', u'faithful', u'to', u'h', u'g', u'wells', u'classic', u'novel', u'and', u'we', u'found', u'it', u'to', u'be', u'very', u'entertaining', u'this', u'made', u'it', u'easy', u'to', u'overlook', u'what', u'the', u'critics', u'perceive', u'to', u'be', u'its', u'shortcomings']
'''
#Download the stop words and other corpora if needed
#nltk.download() # you can use NLTK's built-in stop-word list directly:
#words_nostop = [w for w in words if w not in stopwords.words('english')]
# or load your own list from a file:
stopwords = {}.fromkeys([line.rstrip() for line in open('../stopwords.txt')])
words_nostop = [w for w in words if w not in stopwords] # remove stop words
display(words_nostop, '去掉停用词数据')
'''
去掉停用词数据

----------我是分割线-------------

[u'classic', u'war', u'worlds', u'timothy', u'hines', u'entertaining', u'film', u'effort', u'lengths', u'faithfully', u'recreate', u'classic', u'book', u'hines', u'succeeds', u'watched', u'film', u'appreciated', u'standard', u'predictable', u'hollywood', u'fare', u'spielberg', u'version', u'tom', u'cruise', u'slightest', u'resemblance', u'book', u'movie', u'envision', u'amateur', u'critics', u'criticize', u'rate', u'movie', u'bases', u'entertained', u'people', u'agree', u'critics', u'enjoyed', u'effort', u'hines', u'faithful', u'classic', u'entertaining', u'easy', u'overlook', u'critics', u'perceive', u'shortcomings']
'''
#eng_stopwords = set(stopwords.words('english'))
eng_stopwords = set(stopwords)

# Put the whole cleaning pipeline into one function
def clean_text(text):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    words = [w for w in words if w not in eng_stopwords]
    return ' '.join(words)
clean_text(raw_example)
'''
u'classic war worlds timothy hines entertaining film effort lengths faithfully recreate classic book hines succeeds watched film appreciated standard predictable hollywood fare spielberg version tom cruise slightest resemblance book movie envision amateur critics criticize rate movie bases entertained people agree critics enjoyed effort hines faithful classic entertaining easy overlook critics perceive shortcomings'
'''
# 清洗数据添加到dataframe里
df['clean_review'] = df.review.apply(clean_text) # 对每一行都做清洗处理
df.head()
'''
id	sentiment	review	clean_review
0	5814_8	1	With all this stuff going down at the moment w...	stuff moment mj ve started listening music wat...
1	2381_9	1	"The Classic War of the Worlds" by Timothy Hin...	classic war worlds timothy hines entertaining ...
2	7759_3	0	The film starts with a manager (Nicholas Bell)...	film starts manager nicholas bell investors ro...
3	3630_4	0	It must be assumed that those who praised this...	assumed praised film filmed opera didn read do...
4	9495_8	1	Superbly trashy and wondrously unpretentious 8...	superbly trashy wondrously unpretentious explo...
'''

#抽取bag of words特征(用sklearn的CountVectorizer)(参考之前的代码,这里是规定了一个含有5000个词的词向量)
vectorizer = CountVectorizer(max_features = 5000) 
train_data_features = vectorizer.fit_transform(df.clean_review).toarray()
train_data_features.shape
'''
(25000, 5000)
'''

# 随机森林训练分类器
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(train_data_features, df.sentiment)
#在训练集上做个predict看看效果如何
confusion_matrix(df.sentiment, forest.predict(train_data_features))
'''
array([[12500,     0],
       [    0, 12500]])
'''
       
#删除不用的占内容变量
del df
del train_data_features

#读取测试数据进行预测
datafile = os.path.join('..', 'data', 'testData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
print('Number of reviews: {}'.format(len(df)))
df['clean_review'] = df.review.apply(clean_text)
df.head()
'''
Number of reviews: 25000
id	review	clean_review
0	12311_10	Naturally in a film who's main themes are of m...	naturally film main themes mortality nostalgia...
1	8348_2	This movie is a disaster within a disaster fil...	movie disaster within disaster film full great...
2	5828_4	All in all, this is a movie for kids. We saw i...	movie kids saw tonight child loved one point k...
3	7186_2	Afraid of the Dark left me with the impression...	afraid dark left impression several different ...
4	12128_7	A very accurate depiction of small time mob li...	accurate depiction small time mob life filmed ...
'''
test_data_features = vectorizer.transform(df.clean_review).toarray()
test_data_features.shape
'''
(25000, 5000)
'''
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.head()
'''
id	sentiment
0	12311_10	1
1	8348_2	0
2	5828_4	1
3	7186_2	1
4	12128_7	1
'''
output.to_csv(os.path.join('..', 'data', 'Bag_of_Words_model.csv'), index=False)
del df
del test_data_features



#word2vec训练词向量
import os
import re
import numpy as np
import pandas as pd
​
from bs4 import BeautifulSoup
​
import nltk.data
#nltk.download()
#from nltk.corpus import stopwords
from gensim.models import word2vec
def load_dataset(name, nrows=None):
    datasets = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv'
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join('..', 'data', datasets[name])
    df = pd.read_csv(data_file, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(df)))
    return df
    
#读入无标签数据
#用于训练生成word2vec词向量

df = load_dataset('unlabeled_train')
df.head()
'''
Number of reviews: 50000
id	review
0	9999_0	Watching Time Chasers, it obvious that it was ...
1	45057_0	I saw this film about 20 years ago and remembe...
2	15561_0	Minor Spoilers  In New York, Joan Ba...
3	7161_0	I went to see this film with a great deal of e...
4	43971_0	Yes, I agree with everyone on this site this m...
'''
#Preprocess the data the same way as in the first notebook.
#The small difference: removing stop words is left as an option here.
#eng_stopwords = set(stopwords.words('english'))
eng_stopwords = {}.fromkeys([line.rstrip() for line in open('../stopwords.txt')])

def clean_text(text, remove_stopwords=False):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def print_call_counts(f):
    n = 0
    def wrapped(*args, **kwargs):
        nonlocal n
        n += 1
        if n % 1000 == 1:
            print('method {} called {} times'.format(f.__name__, n))
        return f(*args, **kwargs)
    return wrapped

@print_call_counts
def split_sentences(review):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]
    return sentences

%time sentences = sum(df.review.apply(split_sentences), [])
print('{} reviews -> {} sentences'.format(len(df), len(sentences)))

#Train the word-embedding model with gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Word2Vec training parameters (use at least ~300 dimensions if possible)
num_features = 300    # word vector dimensionality
min_word_count = 40   # minimum word count
num_workers = 4       # number of threads to run in parallel
context = 10          # context window size
downsampling = 1e-3   # downsample setting for frequent words

model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)
print('Training model...')
model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)
# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)
# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model.save(os.path.join('..', 'models', model_name))
'''
Training model...
'''

# Check what the trained word vectors look like
print(model.doesnt_match("man woman child kitchen".split()))
print(model.doesnt_match('france england germany berlin'.split()))
'''
kitchen
berlin
'''
model.most_similar("man")
'''
[('woman', 0.6256189346313477), ('lady', 0.5953349471092224), ('lad', 0.576863169670105),
 ('person', 0.5407935380935669), ('farmer', 0.5382746458053589), ('chap', 0.536788821220398),
 ('soldier', 0.5292650461196899), ('men', 0.5261573791503906), ('monk', 0.5237958431243896),
 ('guy', 0.5213091373443604)]
'''
model.most_similar("queen")
'''
[('princess', 0.6749982833862305), ('maid', 0.6223365068435669), ('bride', 0.6201028227806091),
 ('belle', 0.6200867891311646), ('temple', 0.6171057224273682), ('stripper', 0.608874499797821),
 ('catherine', 0.6072724461555481), ('eva', 0.6019693613052368), ('dancer', 0.594109833240509),
 ('sylvia', 0.5933606624603271)]
'''
model.most_similar("awful")
'''
[('terrible', 0.7551683187484741), ('atrocious', 0.7340768575668335), ('horrible', 0.7315883040428162),
 ('dreadful', 0.7080680131912231), ('abysmal', 0.7010548114776611), ('horrendous', 0.6951696872711182),
 ('appalling', 0.691646933555603), ('horrid', 0.6708598136901855), ('amateurish', 0.6481891870498657),
 ('embarrassing', 0.6306308507919312)]
'''

#Train a sentiment model on top of word2vec
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans

#Same helper as before
def load_dataset(name, nrows=None):
    datasets = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv'
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join('..', 'data', datasets[name])
    df = pd.read_csv(data_file, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(df)))
    return df

eng_stopwords = set(stopwords.words('english'))

def clean_text(text, remove_stopwords=False):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words

#Load the Word2Vec model trained above
model_name = '300features_40minwords_10context.model'
model = Word2Vec.load(os.path.join('..', 'models', model_name))

#Encode each review with the word2vec result.
#The encoding is a bit crude: simply average the vectors of the words in the review.
df = load_dataset('labeled_train')
df.head()
'''
Number of reviews: 25000
id	sentiment	review
0	5814_8	1	With all this stuff going down at the moment w...
1	2381_9	1	"The Classic War of the Worlds" by Timothy Hin...
2	7759_3	0	The film starts with a manager (Nicholas Bell)...
3	3630_4	0	It must be assumed that those who praised this...
4	9495_8	1	Superbly trashy and wondrously unpretentious 8...
'''
def to_review_vector(review):
    words = clean_text(review, remove_stopwords=True)
    array = np.array([model[w] for w in words if w in model])
    return pd.Series(array.mean(axis=0))

train_data_features = df.review.apply(to_review_vector)
train_data_features.head()
'''
(output: 5 rows x 300 columns of averaged word-vector features)
'''

#Train a random forest classifier
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)
#Sanity-check on the training set
confusion_matrix(df.sentiment, forest.predict(train_data_features))
'''
array([[12500,     0],
       [    0, 12500]])
'''

#Free up memory
del df
del train_data_features

#Predict on the test set and submit to Kaggle
df = load_dataset('test')
df.head()
'''
Number of reviews: 25000
id	review
0	12311_10	Naturally in a film who's main themes are of m...
1	8348_2	This movie is a disaster within a disaster fil...
2	5828_4	All in all, this is a movie for kids. We saw i...
3	7186_2	Afraid of the Dark left me with the impression...
4	12128_7	A very accurate depiction of small time mob li...
'''
test_data_features = df.review.apply(to_review_vector)
test_data_features.head()
'''
(output: 5 rows x 300 columns of averaged word-vector features)
'''
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.to_csv(os.path.join('..', 'data', 'Word2Vec_model.csv'), index=False)
output.head()
'''
id	sentiment
0	12311_10	1
1	8348_2	0
2	5828_4	0
3	7186_2	0
4	12128_7	1
'''
del df
del test_data_features
del forest

#Cluster the word vectors and encode reviews as bags of clusters
#KMeans clustering
word_vectors = model.syn0
num_clusters = word_vectors.shape[0] // 10

%%time
kmeans_clustering = KMeans(n_clusters=num_clusters, n_jobs=4)
idx = kmeans_clustering.fit_predict(word_vectors)
'''
CPU times: user 2.03 s, sys: 377 ms, total: 2.41 s
Wall time: 13min 19s
'''
word_centroid_map = dict(zip(model.index2word, idx))

import pickle

filename = 'word_centroid_map_10avg.pickle'
with open(os.path.join('..', 'models', filename), 'bw') as f:
    pickle.dump(word_centroid_map, f)
#with open(os.path.join('..', 'models', filename), 'br') as f:
#    word_centroid_map = pickle.load(f)

#Print a few clusters to inspect them
for cluster in range(0, 10):
    print("\nCluster %d" % cluster)
    print([w for w, c in word_centroid_map.items() if c == cluster])
'''
Cluster 0
['praised', 'appreciated', 'noted', 'avoided', 'criticized', 'admired']
Cluster 1
['misfit', 'con', 'hoodlum', 'spy', 'rogue']
Cluster 2
['contrasts', 'healthy', 'glamour', 'eroticism', 'sensual']
Cluster 3
['matthew', 'kingsley', 'klein', 'hackman', 'meyers', 'perry', 'simpson', 'pullman', 'dana', 'olsen', 'ryan', 'barrie', 'caan', 'tho', 'farina', 'stiller', 'hutton', 'sparks', 'lillard', 'broderick', 'kline', 'reprise', 'mcconaughey', 'carvey', 'harrelson']
Cluster 4
['wolves', 'papillon', 'continent']
Cluster 5
['tick', 'drain', 'nailed', 'puke', 'boil', 'stalk']
Cluster 6
['cotton', 'denver', 'windsor', 'marsh', 'bell']
Cluster 7
['lighting', 'costumes', 'sfx', 'props', 'design', 'costuming', 'designs', 'makeup']
Cluster 8
['decline', 'swashbuckling', 'swashbuckler', 'prestige', 'potboiler', 'latter', 'glory', 'untouchables', 'fame']
Cluster 9
['slashed', 'butchered', 'mutilated', 'eaten', 'slaughtered', 'continually']
'''

#Turn each review into a cluster-bag vector
wordset = set(word_centroid_map.keys())

def make_cluster_bag(review):
    words = clean_text(review, remove_stopwords=True)
    return (pd.Series([word_centroid_map[w] for w in words if w in wordset])
              .value_counts()
              .reindex(range(num_clusters + 1), fill_value=0))

df = load_dataset('labeled_train')
df.head()
'''
Number of reviews: 25000
id	sentiment	review
0	5814_8	1	With all this stuff going down at the moment w...
1	2381_9	1	"The Classic War of the Worlds" by Timothy Hin...
2	7759_3	0	The film starts with a manager (Nicholas Bell)...
3	3630_4	0	It must be assumed that those who praised this...
4	9495_8	1	Superbly trashy and wondrously unpretentious 8...
'''
train_data_features = df.review.apply(make_cluster_bag)
train_data_features.head()
'''
(output: 5 rows x 1306 columns of cluster counts)
'''

#Train a random forest on the cluster bags
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)
#Sanity-check on the training set
confusion_matrix(df.sentiment, forest.predict(train_data_features))
'''
array([[12500,     0],
       [    0, 12500]])
'''

#Free up memory
del df
del train_data_features

#Load the test data and predict
df = load_dataset('test')
df.head()
'''
Number of reviews: 25000
id	review
0	12311_10	Naturally in a film who's main themes are of m...
1	8348_2	This movie is a disaster within a disaster fil...
2	5828_4	All in all, this is a movie for kids. We saw i...
3	7186_2	Afraid of the Dark left me with the impression...
4	12128_7	A very accurate depiction of small time mob li...
'''
test_data_features = df.review.apply(make_cluster_bag)
test_data_features.head()
'''
(output: 5 rows x 1306 columns of cluster counts)
'''
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.to_csv(os.path.join('..', 'data', 'Word2Vec_BagOfClusters.csv'), index=False)
output.head()
'''
id	sentiment
0	12311_10	1
1	8348_2	0
2	5828_4	1
3	7186_2	0
4	12128_7	1
'''
del df
del test_data_features
del forest

Case study: building Chinese data and tools

Load the required libraries
As before we use gensim for the word2vec part, and an SVM from sklearn for the model

# -*- coding: utf-8 -*-
"""
Created on 2015.12.09
​
@author: Hanxiaoyang
"""
from sklearn.cross_validation import train_test_split # 数据集分割
from gensim.models.word2vec import Word2Vec #构建词向量
import numpy as np
import pandas as pd
import jieba # jieba
from sklearn.externals import joblib # 数据转换二进制
from sklearn.svm import SVC
import sys  
reload(sys)  
sys.setdefaultencoding('utf8')

# 载入数据,做预处理(分词),切分训练集与测试集
def load_file_and_preprocessing():
    neg=pd.read_excel('data/neg.xls',header=None,index=None) #负样本
    pos=pd.read_excel('data/pos.xls',header=None,index=None) # 正样本
​
    cw = lambda x: list(jieba.cut(x)) # 分词
    pos['words'] = pos[0].apply(cw)
    neg['words'] = neg[0].apply(cw)#print pos['words']
    #use 1 for positive sentiment, 0 for negative
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
​
    x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos['words'], neg['words'])), y, test_size=0.2) #训练集分割
    
    np.save('svm_data/y_train.npy',y_train)
    np.save('svm_data/y_test.npy',y_test)
    return x_train,x_test
    
#Average the word vectors of all words in a sentence to get a sentence vector
#(TF-IDF weighting could be used instead of a plain average; see the sketch after this function)
def build_sentence_vector(text, size,imdb_w2v): 
    vec = np.zeros(size).reshape((1, size)) #
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count #取平均
    return vec
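A minimal sketch (my own, not part of the original script) of the TF-IDF weighting mentioned above: each word vector is weighted by its IDF before averaging.

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def build_weighted_sentence_vector(text, size, imdb_w2v, idf_weights):
    vec = np.zeros(size).reshape((1, size))
    total_weight = 0.
    for word in text:
        try:
            w = idf_weights.get(word, 1.0)        # fall back to weight 1 for unseen words
            vec += w * imdb_w2v[word].reshape((1, size))
            total_weight += w
        except KeyError:
            continue
    if total_weight != 0:
        vec /= total_weight
    return vec

# idf_weights would be built once from the tokenized training sentences, e.g.:
# tfidf = TfidfVectorizer(analyzer=lambda x: x).fit(x_train)
# idf_weights = dict(zip(tfidf.get_feature_names(), tfidf.idf_))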
    
    
#计算词向量
def get_train_vecs(x_train,x_test):
    n_dim = 300
    #初始化模型和词表
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)
    
    #在评论训练集上建模(可能会花费几分钟)
    imdb_w2v.train(x_train)
    
    train_vecs = np.concatenate([build_sentence_vector(z, n_dim,imdb_w2v) for z in x_train])
    #train_vecs = scale(train_vecs)
    
    np.save('svm_data/train_vecs.npy',train_vecs)
    print train_vecs.shape
    #在测试集上训练
    imdb_w2v.train(x_test)
    imdb_w2v.save('svm_data/w2v_model/w2v_model.pkl')
    #Build test tweet vectors then scale
    test_vecs = np.concatenate([build_sentence_vector(z, n_dim,imdb_w2v) for z in x_test])
    #test_vecs = scale(test_vecs)
    np.save('svm_data/test_vecs.npy',test_vecs)
    print test_vecs.shape
def get_data():
    train_vecs=np.load('svm_data/train_vecs.npy')
    y_train=np.load('svm_data/y_train.npy')
    test_vecs=np.load('svm_data/test_vecs.npy')
    y_test=np.load('svm_data/y_test.npy') 
    return train_vecs,y_train,test_vecs,y_test
    
#训练svm模型¶
def svm_train(train_vecs,y_train,test_vecs,y_test):
    clf=SVC(kernel='rbf',verbose=True)
    clf.fit(train_vecs,y_train)
    joblib.dump(clf, 'svm_data/svm_model/model.pkl')
    print clf.score(test_vecs,y_test)
    
#构建待预测句子的向量
def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    #imdb_w2v.train(words)
    train_vecs = build_sentence_vector(words, n_dim,imdb_w2v)
    #print train_vecs.shape
    return train_vecs
    
#对单个句子进行情感判断
def svm_predict(string):
    words=jieba.lcut(string)
    words_vecs=get_predict_vecs(words)
    clf=joblib.load('svm_data/svm_model/model.pkl')
     
    result=clf.predict(words_vecs)
    
    if int(result[0])==1:
        print string,' positive'
    else:
        print string,' negative'
        
##对输入句子情感进行判断
string='电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如'
#string='牛逼的手机,从3米高的地方摔下去都没坏,质量非常好'    
svm_predict(string)

Case study: daily news

Predicting financial-market moves from daily news (advanced version)
Kaggle competition: https://www.kaggle.com/aaron7sun/stocknews

In this tutorial we will see a slightly more sophisticated way of using word2vec

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from datetime import date
Inspect the data
First we read in the data. Here I provide a file that has already been combined.

data = pd.read_csv('../input/Combined_News_DJIA.csv')
Now we can take a look at what the data looks like

data.head()
'''
Date	Label	Top1	Top2	Top3	Top4	Top5	Top6	Top7	Top8	...	Top16	Top17	Top18	Top19	Top20	Top21	Top22	Top23	Top24	Top25
0	2008-08-08	0	b"Georgia 'downs two Russian warplanes' as cou...	b'BREAKING: Musharraf to be impeached.'	b'Russia Today: Columns of troops roll into So...	b'Russian tanks are moving towards the capital...	b"Afghan children raped with 'impunity,' U.N. ...	b'150 Russian tanks have entered South Ossetia...	b"Breaking: Georgia invades South Ossetia, Rus...	b"The 'enemy combatent' trials are nothing but...	...	b'Georgia Invades South Ossetia - if Russia ge...	b'Al-Qaeda Faces Islamist Backlash'	b'Condoleezza Rice: "The US would not act to p...	b'This is a busy day: The European Union has ...	b"Georgia will withdraw 1,000 soldiers from Ir...	b'Why the Pentagon Thinks Attacking Iran is a ...	b'Caucasus in crisis: Georgia invades South Os...	b'Indian shoe manufactory - And again in a se...	b'Visitors Suffering from Mental Illnesses Ban...	b"No Help for Mexico's Kidnapping Surge"
1	2008-08-11	1	b'Why wont America and Nato help us? If they w...	b'Bush puts foot down on Georgian conflict'	b"Jewish Georgian minister: Thanks to Israeli ...	b'Georgian army flees in disarray as Russians ...	b"Olympic opening ceremony fireworks 'faked'"	b'What were the Mossad with fraudulent New Zea...	b'Russia angered by Israeli military sale to G...	b'An American citizen living in S.Ossetia blam...	...	b'Israel and the US behind the Georgian aggres...	b'"Do not believe TV, neither Russian nor Geor...	b'Riots are still going on in Montreal (Canada...	b'China to overtake US as largest manufacturer'	b'War in South Ossetia [PICS]'	b'Israeli Physicians Group Condemns State Tort...	b' Russia has just beaten the United States ov...	b'Perhaps *the* question about the Georgia - R...	b'Russia is so much better at war'	b"So this is what it's come to: trading sex fo...
2	2008-08-12	0	b'Remember that adorable 9-year-old who sang a...	b"Russia 'ends Georgia operation'"	b'"If we had no sexual harassment we would hav...	b"Al-Qa'eda is losing support in Iraq because ...	b'Ceasefire in Georgia: Putin Outmaneuvers the...	b'Why Microsoft and Intel tried to kill the XO...	b'Stratfor: The Russo-Georgian War and the Bal...	b"I'm Trying to Get a Sense of This Whole Geor...	...	b'U.S. troops still in Georgia (did you know t...	b'Why Russias response to Georgia was right'	b'Gorbachev accuses U.S. of making a "serious ...	b'Russia, Georgia, and NATO: Cold War Two'	b'Remember that adorable 62-year-old who led y...	b'War in Georgia: The Israeli connection'	b'All signs point to the US encouraging Georgi...	b'Christopher King argues that the US and NATO...	b'America: The New Mexico?'	b"BBC NEWS | Asia-Pacific | Extinction 'by man...
3	2008-08-13	0	b' U.S. refuses Israel weapons to attack Iran:...	b"When the president ordered to attack Tskhinv...	b' Israel clears troops who killed Reuters cam...	b'Britain\'s policy of being tough on drugs is...	b'Body of 14 year old found in trunk; Latest (...	b'China has moved 10 *million* quake survivors...	b"Bush announces Operation Get All Up In Russi...	b'Russian forces sink Georgian ships '	...	b'Elephants extinct by 2020?'	b'US humanitarian missions soon in Georgia - i...	b"Georgia's DDOS came from US sources"	b'Russian convoy heads into Georgia, violating...	b'Israeli defence minister: US against strike ...	b'Gorbachev: We Had No Choice'	b'Witness: Russian forces head towards Tbilisi...	b' Quarter of Russians blame U.S. for conflict...	b'Georgian president says US military will ta...	b'2006: Nobel laureate Aleksander Solzhenitsyn...
4	2008-08-14	1	b'All the experts admit that we should legalis...	b'War in South Osetia - 89 pictures made by a ...	b'Swedish wrestler Ara Abrahamian throws away ...	b'Russia exaggerated the death toll in South O...	b'Missile That Killed 9 Inside Pakistan May Ha...	b"Rushdie Condemns Random House's Refusal to P...	b'Poland and US agree to missle defense deal. ...	b'Will the Russians conquer Tblisi? Bet on it,...	...	b'Bank analyst forecast Georgian crisis 2 days...	b"Georgia confict could set back Russia's US r...	b'War in the Caucasus is as much the product o...	b'"Non-media" photos of South Ossetia/Georgia ...	b'Georgian TV reporter shot by Russian sniper ...	b'Saudi Arabia: Mother moves to block child ma...	b'Taliban wages war on humanitarian aid workers'	b'Russia: World "can forget about" Georgia\'s...	b'Darfur rebels accuse Sudan of mounting major...	b'Philippines : Peace Advocate say Muslims nee...
5 rows × 27 columns
'''
The setup is simple and intuitive: if the label is 1, the DJIA rose or stayed flat that day; if it is 0, it fell.

分割测试/训练集
这下,我们可以先把数据给分成Training/Testing data

train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']
然后,我们把每条新闻做成一个单独的句子,集合在一起:

X_train = train[train.columns[2:]]
corpus = X_train.values.flatten().astype(str)

X_train = X_train.values.astype(str)
X_train = np.array([' '.join(x) for x in X_train])
X_test = test[test.columns[2:]]
X_test = X_test.values.astype(str)
X_test = np.array([' '.join(x) for x in X_test])
y_train = train['Label'].values
y_test = test['Label'].values
这里我们注意,我们需要三样东西:

corpus是全部我们『可见』的文本资料。我们假设每条新闻就是一句话,把他们全部flatten()了,我们就会得到list of sentences。

同时我们的X_train和X_test可不能随便flatten,他们需要与y_train和y_test对应。

corpus[:3]
array([ 'b"Georgia \'downs two Russian warplanes\' as countries move to brink of war"',
       "b'BREAKING: Musharraf to be impeached.'",
       "b'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)'"], 
      dtype=')
X_train[:1]
'''
array([ 'b"Georgia \'downs two Russian warplanes\' as countries move to brink of war" b\'BREAKING: Musharraf to be impeached.\' b\'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)\' b\'Russian tanks are moving towards the capital of South Ossetia, which has reportedly been completely destroyed by Georgian artillery fire\' b"Afghan children raped with \'impunity,\' U.N. official says - this is sick, a three year old was raped and they do nothing" b\'150 Russian tanks have entered South Ossetia whilst Georgia shoots down two Russian jets.\' b"Breaking: Georgia invades South Ossetia, Russia warned it would intervene on SO\'s side" b"The \'enemy combatent\' trials are nothing but a sham: Salim Haman has been sentenced to 5 1/2 years, but will be kept longer anyway just because they feel like it." b\'Georgian troops retreat from S. Osettain capital, presumably leaving several hundred people killed. [VIDEO]\' b\'Did the U.S. Prep Georgia for War with Russia?\' b\'Rice Gives Green Light for Israel to Attack Iran: Says U.S. has no veto over Israeli military ops\' b\'Announcing:Class Action Lawsuit on Behalf of American Public Against the FBI\' b"So---Russia and Georgia are at war and the NYT\'s top story is opening ceremonies of the Olympics?  What a fucking disgrace and yet further proof of the decline of journalism." b"China tells Bush to stay out of other countries\' affairs" b\'Did World War III start today?\' b\'Georgia Invades South Ossetia - if Russia gets involved, will NATO absorb Georgia and unleash a full scale war?\' b\'Al-Qaeda Faces Islamist Backlash\' b\'Condoleezza Rice: "The US would not act to prevent an Israeli strike on Iran." Israeli Defense Minister Ehud Barak: "Israel is prepared for uncompromising victory in the case of military hostilities."\' b\'This is a busy day:  The European Union has approved new sanctions against Iran in protest at its nuclear programme.\' b"Georgia will withdraw 1,000 soldiers from Iraq to help fight off Russian forces in Georgia\'s breakaway region of South Ossetia" b\'Why the Pentagon Thinks Attacking Iran is a Bad Idea - US News & World Report\' b\'Caucasus in crisis: Georgia invades South Ossetia\' b\'Indian shoe manufactory  - And again in a series of "you do not like your work?"\' b\'Visitors Suffering from Mental Illnesses Banned from Olympics\' b"No Help for Mexico\'s Kidnapping Surge"'], 
      dtype='
y_train[:5]
array([0, 1, 0, 0, 1])
来,我们再把每个单词给分隔开:

同样,corpus和X_train的处理不同

from nltk.tokenize import word_tokenize

corpus = [word_tokenize(x) for x in corpus]
X_train = [word_tokenize(x) for x in X_train]
X_test = [word_tokenize(x) for x in X_test]
tokenize完毕后,

我们可以看到,虽然corpus和x都是一个二维数组,但是他们的意义不同了。

corpus里,第二维数据是一个个句子。

x里,第二维数据是一个个数据点(对应每个label)


预处理
我们进行一些预处理来把我们的文本资料变得更加统一:

小写化

删除停止词

删除数字与符号

lemma

我们把这些功能合为一个func:

# 停止词
from nltk.corpus import stopwords
stop = stopwords.words('english')

# 数字
import re
def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))

# 特殊符号
def isSymbol(inputString):
    return bool(re.match(r'[^\w]', inputString))

# lemma
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def check(word):
    """
    如果需要这个单词,则True
    如果应该去除,则False
    """
    word= word.lower()
    if word in stop:
        return False
    elif hasNumbers(word) or isSymbol(word):
        return False
    else:
        return True

# 把上面的方法综合起来
def preprocessing(sen):
    res = []
    for word in sen:
        if check(word):
            # 这一段的用处仅仅是去除python里面byte存str时候留下的标识。。之前数据没处理好,其他case里不会有这个情况
            word = word.lower().replace("b'", '').replace('b"', '').replace('"', '').replace("'", '')
            res.append(wordnet_lemmatizer.lemmatize(word))
    return res
把我们三个数据组都来处理一下:

corpus = [preprocessing(x) for x in corpus]
X_train = [preprocessing(x) for x in X_train]
X_test = [preprocessing(x) for x in X_test]
我们再来看看处理之后的数据长相:

print(corpus[553])
print(X_train[523])
['north', 'korean', 'leader', 'kim', 'jong-il', 'confirmed', 'ill']
['two', 'redditors', 'climbing', 'mt', 'kilimanjaro', 'charity', 'bidding', 'peak', 'nt', 'squander', 'opportunity', 'let', 'upvotes', 'something', 'awesome', 'estimated', 'take', 'year', 'clear', 'lao', 'explosive', 'remnant', 'left', 'behind', 'united', 'state', 'bomber', 'year', 'ago', 'people', 'died', 'unexploded', 'ordnance', 'since', 'conflict', 'ended', 'fidel', 'ahmadinejad', 'slandering', 'jew', 'mossad', 'america', 'israel', 'intelligence', 'agency', 'target', 'united', 'state', 'intensively', 'among', 'nation', 'considered', 'friendly', 'washington', 'israel', 'lead', 'others', 'active', 'espionage', 'directed', 'american', 'company', 'defense', 'department', 'australian', 'election', 'day', 'poll', 'rural/regional', 'independent', 'member', 'parliament', 'support', 'labor', 'minority', 'goverment', 'julia', 'gillard', 'prime', 'minister', 'france', 'plan', 'raise', 'retirement', 'age', 'set', 'strike', 'britain', 'parliament', 'police', 'murdoch', 'paper', 'adviser', 'pm', 'implicated', 'voicemail', 'hacking', 'scandal', 'british', 'policeman', 'jailed', 'month', 'cell', 'attack', 'woman', 'rest', 'email', 'display', 'fundemental', 'disdain', 'pluralistic', 'america', 'reveals', 'chilling', 'level', 'islamophobia', 'hatemongering', 'church', 'plan', 'burn', 'quran', 'endanger', 'troop', 'u', 'commander', 'warns', 'freed', 'journalist', 'tricked', 'captor', 'twitter', 'access', 'manila', 'water', 'crisis', 'expose', 'impact', 'privatisation', 'july', 'week-long', 'rationing', 'water', 'highlighted', 'reality', 'million', 'people', 'denied', 'basic', 'right', 'potable', 'water', 'sanitation', 'private', 'firm', 'rake', 'profit', 'expense', 'weird', 'uk', 'police', 'ask', 'help', 'case', 'slain', 'intelligence', 'agent', 'greenpeace', 'japan', 'anti-whaling', 'activist', 'found', 'guilty', 'theft', 'captured', 'journalist', 'trick', 'captor', 'revealing', 'alive', 'creepy', 'biometric', 'id', 'forced', 'onto', 'india', 'billion', 'inhabitant', 'fear', 'loss', 'privacy', 'government', 'abuse', 'abound', 'india', 'gear', 'biometrically', 'identify', 'number', 'billion', 'inhabitant', 'china', 'young', 'officer', 'syndrome', 'china', 'military', 'spending', 'growing', 'fast', 'overtaken', 'strategy', 'said', 'professor', 'huang', 'jing', 'school', 'public', 'policy', 'young', 'officer', 'taking', 'control', 'strategy', 'like', 'young', 'officer', 'japan', 'mexican', 'soldier', 'open', 'fire', 'family', 'car', 'military', 'checkpoint', 'killing', 'father', 'son', 'death', 'toll', 'continues', 'climb', 'guatemala', 'landslide', 'foreign', 'power', 'stop', 'interfering', 'case', 'iranian', 'woman', 'sentenced', 'death', 'stoning', 'iran', 'foreign', 'ministry', 'said', 'mexican', 'official', 'gunman', 'behind', 'massacre', 'killed', 'tv', 'anchor', 'stabbed', 'death', 'outside', 'kabul', 'home', 'mosque', 'menace', 'confined', 'lower', 'manhattan', 'many', 'european', 'country', 'similar', 'alarm', 'sounded', 'muslim', 'coming', 'french', 'citizen', 'barred', 'american', 'military', 'base', 'dutch', 'neo-nazi', 'donates', 'sperm', 'white', 'dutch', 'neo-nazi', 'offered', 'donate', 'sperm', 'four', 'fertility', 'clinic', 'netherlands', 'effort', 'promote', 'call', 'strong', 'white', 'race']
训练NLP模型
有了这些干净的数据集,我们可以做我们的NLP模型了。

我们先用最简单的Word2Vec

from gensim.models.word2vec import Word2Vec

model = Word2Vec(corpus, size=128, window=5, min_count=5, workers=4)
这时候,每个单词都可以像查找字典一样,读出他们的w2v坐标了:

model['ok']
array([-0.29960674,  0.03145241,  0.00570022,  0.09868251, -0.17285152,
        0.01856422, -0.01089751,  0.15153641,  0.17857222, -0.03622751,
        0.10417395, -0.0260475 ,  0.08195975, -0.06125315,  0.04687231,
        0.05752773,  0.12925589,  0.07877159, -0.13440445,  0.20191686,
        0.12656711, -0.06969397,  0.02447173, -0.02880211,  0.10401903,
        0.14345747, -0.07248937,  0.18151827,  0.04464363,  0.14008987,
        0.02799574,  0.1358372 ,  0.10382857,  0.11229188, -0.0558577 ,
        0.02732387,  0.0209927 , -0.09975895, -0.08367401, -0.05347675,
        0.0048474 ,  0.01783419,  0.13062523, -0.01942245, -0.18787207,
        0.24485843,  0.0890732 ,  0.15354921, -0.02848417, -0.17805465,
        0.12659959,  0.07361489,  0.11841691, -0.0817158 , -0.09146189,
       -0.15631667,  0.07889554,  0.06325027, -0.21279941,  0.22228   ,
       -0.11718205,  0.13774644,  0.15049173,  0.13688704,  0.33995184,
       -0.12521227,  0.01028001, -0.12642032,  0.07831606, -0.0252238 ,
       -0.03395513,  0.03965646,  0.22474508, -0.13310082,  0.13553855,
       -0.10668604,  0.22141342,  0.06522292,  0.14127599,  0.08240495,
        0.03247302,  0.30142626,  0.09993532, -0.18855172,  0.01956543,
        0.16998382, -0.155719  , -0.06757715,  0.17540725,  0.02754072,
        0.04505057, -0.06942102,  0.04041849,  0.33680534, -0.03090001,
       -0.08459242,  0.00468331, -0.08084729,  0.15038815,  0.2194476 ,
        0.10415938, -0.02096822,  0.26186588, -0.00954993,  0.1127312 ,
        0.14906277, -0.0927472 ,  0.19095857,  0.24790056, -0.003826  ,
        0.04918066, -0.02232081, -0.03569063, -0.17610529, -0.08925602,
       -0.06415266,  0.28868139,  0.01529911, -0.22414474, -0.15126266,
        0.24473965, -0.09966447,  0.22041951,  0.17169574,  0.12241554,
       -0.1190941 , -0.2071649 ,  0.04336704], dtype=float32)
用NLP模型表达我们的X
接着,我们于是就可以用这个坐标,来表示我们的之前干干净净的X。

但是这儿有个问题。我们的vec是基于每个单词的,怎么办呢?

由于我们文本本身的量很小,我们可以把所有的单词的vector拿过来取个平均值:

# 先拿到全部的vocabulary
vocab = model.vocab

# 得到任意text的vector
def get_vector(word_list):
    # 建立一个全是0的array
    res =np.zeros([128])
    count = 0
    for word in word_list:
        if word in vocab:
            res += model[word]
            count += 1
    return res/count    
此时,我们得到了一个取得任意word list平均vector值得方法:

get_vector(['hello', 'from', 'the', 'other', 'side'])
array([-0.31350832,  0.04835839,  0.0048861 ,  0.11434336, -0.1799269 ,
        0.05557305, -0.02394118,  0.16106121,  0.1858674 , -0.04397187,
        0.10581181, -0.04210376,  0.1221713 , -0.05319506,  0.04339079,
        0.0631889 ,  0.1334364 ,  0.10450788, -0.13369248,  0.17621091,
        0.11581808, -0.07839958,  0.01126511, -0.03497357,  0.1140593 ,
        0.15078972, -0.06713609,  0.17556626,  0.04463732,  0.14599135,
        0.05664013,  0.14580157,  0.13244719,  0.10890759, -0.08809417,
        0.02204922,  0.02513832, -0.10544483, -0.09301682, -0.04607506,
       -0.0043104 ,  0.03133655,  0.13699191,  0.01113589, -0.19221411,
        0.23371264,  0.07863618,  0.16435402, -0.01011975, -0.18208385,
        0.1232647 ,  0.09053386,  0.11891054, -0.0958069 , -0.06351973,
       -0.13449311,  0.08414212,  0.08572642, -0.18276297,  0.2460763 ,
       -0.13310654,  0.1621725 ,  0.14568455,  0.16453338,  0.32360496,
       -0.16287505,  0.0061395 , -0.13277827,  0.06658031, -0.00849631,
       -0.04312018,  0.05207892,  0.23483992, -0.12247395,  0.14737971,
       -0.09834758,  0.23857855,  0.10417985,  0.19205472,  0.07291839,
        0.05432127,  0.31228056,  0.10667485, -0.19113681,  0.02985532,
        0.17966536, -0.17697723, -0.07287586,  0.17143352, -0.00642007,
        0.03691518, -0.06739308,  0.06709844,  0.3335989 , -0.00951616,
       -0.10647952,  0.02660648, -0.07802326,  0.15588878,  0.23749367,
        0.10878561,  0.01947832,  0.21858906,  0.00061314,  0.14626372,
        0.16053095, -0.11741858,  0.22829354,  0.21768039, -0.00993046,
        0.08663368, -0.03933012, -0.06405959, -0.19196833, -0.08962602,
       -0.09244292,  0.31830364, -0.00367699, -0.220584  , -0.13564284,
        0.23458903, -0.12524679,  0.21924314,  0.1900594 ,  0.11750702,
       -0.10788013, -0.23775842,  0.04254359])
这样,我们可以同步把我们的X都给转化成128维的一个vector list

(为了之后内容的方便,我先把之前我们处理好的wordlist给存下来。)

wordlist_train = X_train
wordlist_test = X_test

X_train = [get_vector(x) for x in X_train]
X_test = [get_vector(x) for x in X_test]

print(X_train[10])
[-0.49616703  0.09831359  0.00215805  0.21753911 -0.24626317  0.06163961
 -0.01088745  0.24991728  0.23802179 -0.06698225  0.20064797 -0.06599116
  0.19794046 -0.07558411  0.07941745  0.06008136  0.21772295  0.1405834
 -0.18259355  0.31937215  0.17264133 -0.15873611  0.07915295 -0.04948181
  0.12474476  0.228815   -0.0983179   0.28874519  0.03865414  0.23373656
  0.02709786  0.26116451  0.17697223  0.19303173 -0.07914471  0.0832512
  0.03469482 -0.15666168 -0.07268126 -0.11324668 -0.01033463  0.05544584
  0.18356295 -0.01587121 -0.29344295  0.47934875  0.09503752  0.19703337
 -0.00700106 -0.28184425  0.23042896  0.11191312  0.18606185 -0.08041457
 -0.14015471 -0.24658055  0.11680572  0.07711736 -0.33447557  0.35767368
 -0.17762948  0.26263842  0.23744292  0.23772386  0.49314497 -0.19214054
 -0.0103213  -0.20215777  0.07296897 -0.02735564 -0.07290107  0.07402693
  0.38805058 -0.21641682  0.2155705  -0.12779231  0.35236881  0.13735026
  0.26753956  0.14389433  0.06487844  0.48379081  0.16635294 -0.29834936
  0.04311308  0.29604598 -0.19924251 -0.1253365   0.32512026  0.01877471
  0.08315832 -0.08786424  0.07361021  0.53263441  0.02459062 -0.15883806
  0.01811006 -0.13394059  0.2364613   0.32745286  0.19984239 -0.0058238
  0.4130943  -0.00106707  0.17232155  0.28846937 -0.15492516  0.33995942
  0.37123214 -0.04274277  0.11453501 -0.0671717  -0.06910405 -0.27783239
 -0.14075597 -0.12174125  0.51931463 -0.03699218 -0.40793285 -0.23604008
  0.36195668 -0.20882112  0.377449    0.29138071  0.18641824 -0.16645377
 -0.40529569  0.00538284]
建立ML模型
这里,因为我们128维的每一个值都是连续关系的。不是分裂开考虑的。所以,道理上讲,我们是不太适合用RandomForest这类把每个column当做单独的variable来看的方法。(当然,事实是,你也可以这么用)

好的,我们来看看比较适合连续函数的方法:SVM

from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

params = [0.1,0.5,1,3,5,7,10,12,16,20,25,30,35,40]
test_scores = []
for param in params:
    clf = SVR(gamma=param)
    test_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
    test_scores.append(np.mean(test_score))
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(params, test_scores)
plt.title("Param vs CV AUC Score");
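A possible follow-up (my own addition, not in the original notebook): take the gamma with the best cross-validated AUC, refit on the full training set, and score the held-out test set. SVR outputs continuous scores, which roc_auc_score can consume directly.

from sklearn.metrics import roc_auc_score

best_gamma = params[int(np.argmax(test_scores))]   # gamma with the highest CV AUC
final_clf = SVR(gamma=best_gamma)
final_clf.fit(X_train, y_train)
print(roc_auc_score(y_test, final_clf.predict(X_test)))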
'''
Stepping things up with a CNN
Some of you may say: isn't this a bit crude? We just averaged the word vectors and ran a model on that?

Of course it doesn't have to stay that simple.

There are more rigorous ways to handle it.

For example:

represent each data point as a big matrix of word vectors, and use a CNN for "dimensionality reduction + attention".

(To keep the demo simple, I'll keep the following case small. If you want it more complex and accurate, just turn the parameters up.)

First, we fix a padding_size.

What is the padding size?

It simply makes sure every matrix we generate has the same size (see the slides for details).

(We could do this most easily with keras' sequence utilities, and a sketch of that follows the function below, but I want you to see clearly what happens internally.)
'''
# Note: for each day's news we keep the first 256 words; anything missing is padded with zero vectors
# vec_size is the size of the word vectors themselves
def transform_to_matrix(x, padding_size=256, vec_size=128):
    res = []
    for sen in x:
        matrix = []
        for i in range(padding_size):
            try:
                matrix.append(model[sen[i]].tolist())
            except (KeyError, IndexError):
                # Two things can go wrong here:
                # 1. the word is not in the vocabulary (KeyError)
                # 2. the sentence is shorter than padding_size (IndexError)
                # Either way, we simply append an all-zero vector
                matrix.append([0] * vec_size)
        res.append(matrix)
    return res
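
As mentioned above, keras' sequence utilities can do this padding for us. A hedged sketch (recent Keras versions, whose pad_sequences infers the per-step vector shape, and assuming the gensim model supports `word in model` as in the loop above):

from keras.preprocessing.sequence import pad_sequences

def transform_with_keras(wordlists, padding_size=256):
    # keep only in-vocabulary words, then let keras pad/truncate to a fixed length
    seqs = [[model[w] for w in sen if w in model] for sen in wordlists]
    return pad_sequences(seqs, maxlen=padding_size, dtype='float32',
                         padding='post', truncating='post', value=0.0)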
    
# Now let's run our original word lists through it:

X_train = transform_to_matrix(wordlist_train)
X_test = transform_to_matrix(wordlist_test)

print(X_train[123])
'''
As you can see, each item is now one big matrix of size 256 * 128 (padding_size * vec_size).

Each such matrix corresponds to one data point.

Before the next step, we need to reshape our input.

The reason is that each matrix has to be wrapped in one extra outer dimension: the channel axis (think of a one-channel, grayscale "image"), which is the input format the 2D convolution layer expects.

(For a stock case like this there actually is a temporal relationship between consecutive data points. I won't dig into that here; interested readers can look up CNN+LSTM, a powerful deep-learning combination with memory.)
'''
# convert to numpy arrays for easier handling
X_train = np.array(X_train)
X_test = np.array(X_test)

# check the array shapes
print(X_train.shape)
print(X_test.shape)
'''
(1611, 256, 128) # training set: samples, words per sample, vector length per word
(378, 256, 128)
'''
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1], X_train.shape[2]) # note where the 1 goes: it is the channel dimension (channels-first layout)
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1], X_test.shape[2])

print(X_train.shape)
print(X_test.shape)
'''
(1611, 1, 256, 128)
(378, 1, 256, 128)
'''
# Next, let's define our CNN model properly

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers.core import Dense, Dropout, Activation, Flatten

# set parameters:
batch_size = 32
n_filter = 16
filter_length = 4
nb_epoch = 5
n_pool = 2

# Create a new Sequential model
model = Sequential()
model.add(Convolution2D(n_filter, filter_length, filter_length,  # 2D convolution layer
                        input_shape=(1, 256, 128)))
model.add(Activation('relu'))
model.add(Convolution2D(n_filter, filter_length, filter_length))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(n_pool, n_pool))) # pooling layer: keep the maximum of each window
model.add(Dropout(0.25))
model.add(Flatten()) # flatten the 2D feature maps into a 1D vector
# followed by a small fully-connected network
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1)) # a single output unit
model.add(Activation('softmax')) # note: softmax on a single unit always outputs 1; sigmoid is the usual choice for a binary output (kept as in the original run)
# compile the model
model.compile(loss='mse',
              optimizer='adadelta',
              metrics=['accuracy'])
Using Theano backend.
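
(Side note, a hedged sketch of my own rather than the notebook's original run: in the current Keras API the same architecture would look roughly like the code below, with a sigmoid output and binary_crossentropy, which is the usual setup for a 0/1 label.)

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten

model2 = Sequential()
model2.add(Conv2D(n_filter, (filter_length, filter_length), activation='relu',
                  input_shape=(1, 256, 128), data_format='channels_first'))
model2.add(Conv2D(n_filter, (filter_length, filter_length), activation='relu',
                  data_format='channels_first'))
model2.add(MaxPooling2D(pool_size=(n_pool, n_pool), data_format='channels_first'))
model2.add(Dropout(0.25))
model2.add(Flatten())
model2.add(Dense(128, activation='relu'))
model2.add(Dropout(0.5))
model2.add(Dense(1, activation='sigmoid'))                 # sigmoid for a binary label
model2.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])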
# Now we can feed in our X and y

model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          verbose=0)
score = model.evaluate(X_test, y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])
'''
Test score: 0.492063492221
Test accuracy: 0.507936509829
Thoughts:
Although we used word2vec here, the CNN does not care what the input actually is; as long as it has the right shape, it will happily process it.

That gives us plenty of room to "diverge":

We could, for example, represent the character at each position by its ASCII/byte code (0-255) and assemble those codes into a big matrix.

The nice part is that you then barely need any preprocessing, because every single character can be represented and carries meaning.

You can also swap in different classifiers:

Here we used the simplest possible network on top.

You could hook an LSTM or another RNN onto the CNN after that Flatten layer instead.
'''
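To make the character-level idea above concrete, here is a minimal sketch (my own illustration, not part of the original notebook): one row per character position, one-hot over the 0-255 byte codes, padded/truncated to a fixed length, so no word-level preprocessing is needed.

import numpy as np

def to_char_matrix(text, max_len=256, n_codes=256):
    # one row per character position, one-hot over the byte codes 0-255
    mat = np.zeros((max_len, n_codes), dtype=np.float32)
    for i, ch in enumerate(text[:max_len]):
        mat[i, min(ord(ch), n_codes - 1)] = 1.0
    return mat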

Text classification with FastText

# Predicting financial market movement from daily news (advanced version)
# In this tutorial we will use FastText for the classification

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from datetime import date
# Inspect the data
# Let's read the data in first. Here I provide a file that has already been combined.

data = pd.read_csv('../input/Combined_News_DJIA.csv')
# Now let's take a look at what the data looks like

data.head()
'''
Date	Label	Top1	Top2	Top3	Top4	Top5	Top6	Top7	Top8	...	Top16	Top17	Top18	Top19	Top20	Top21	Top22	Top23	Top24	Top25
0	2008-08-08	0	b"Georgia 'downs two Russian warplanes' as cou...	b'BREAKING: Musharraf to be impeached.'	b'Russia Today: Columns of troops roll into So...	b'Russian tanks are moving towards the capital...	b"Afghan children raped with 'impunity,' U.N. ...	b'150 Russian tanks have entered South Ossetia...	b"Breaking: Georgia invades South Ossetia, Rus...	b"The 'enemy combatent' trials are nothing but...	...	b'Georgia Invades South Ossetia - if Russia ge...	b'Al-Qaeda Faces Islamist Backlash'	b'Condoleezza Rice: "The US would not act to p...	b'This is a busy day: The European Union has ...	b"Georgia will withdraw 1,000 soldiers from Ir...	b'Why the Pentagon Thinks Attacking Iran is a ...	b'Caucasus in crisis: Georgia invades South Os...	b'Indian shoe manufactory - And again in a se...	b'Visitors Suffering from Mental Illnesses Ban...	b"No Help for Mexico's Kidnapping Surge"
1	2008-08-11	1	b'Why wont America and Nato help us? If they w...	b'Bush puts foot down on Georgian conflict'	b"Jewish Georgian minister: Thanks to Israeli ...	b'Georgian army flees in disarray as Russians ...	b"Olympic opening ceremony fireworks 'faked'"	b'What were the Mossad with fraudulent New Zea...	b'Russia angered by Israeli military sale to G...	b'An American citizen living in S.Ossetia blam...	...	b'Israel and the US behind the Georgian aggres...	b'"Do not believe TV, neither Russian nor Geor...	b'Riots are still going on in Montreal (Canada...	b'China to overtake US as largest manufacturer'	b'War in South Ossetia [PICS]'	b'Israeli Physicians Group Condemns State Tort...	b' Russia has just beaten the United States ov...	b'Perhaps *the* question about the Georgia - R...	b'Russia is so much better at war'	b"So this is what it's come to: trading sex fo...
2	2008-08-12	0	b'Remember that adorable 9-year-old who sang a...	b"Russia 'ends Georgia operation'"	b'"If we had no sexual harassment we would hav...	b"Al-Qa'eda is losing support in Iraq because ...	b'Ceasefire in Georgia: Putin Outmaneuvers the...	b'Why Microsoft and Intel tried to kill the XO...	b'Stratfor: The Russo-Georgian War and the Bal...	b"I'm Trying to Get a Sense of This Whole Geor...	...	b'U.S. troops still in Georgia (did you know t...	b'Why Russias response to Georgia was right'	b'Gorbachev accuses U.S. of making a "serious ...	b'Russia, Georgia, and NATO: Cold War Two'	b'Remember that adorable 62-year-old who led y...	b'War in Georgia: The Israeli connection'	b'All signs point to the US encouraging Georgi...	b'Christopher King argues that the US and NATO...	b'America: The New Mexico?'	b"BBC NEWS | Asia-Pacific | Extinction 'by man...
3	2008-08-13	0	b' U.S. refuses Israel weapons to attack Iran:...	b"When the president ordered to attack Tskhinv...	b' Israel clears troops who killed Reuters cam...	b'Britain\'s policy of being tough on drugs is...	b'Body of 14 year old found in trunk; Latest (...	b'China has moved 10 *million* quake survivors...	b"Bush announces Operation Get All Up In Russi...	b'Russian forces sink Georgian ships '	...	b'Elephants extinct by 2020?'	b'US humanitarian missions soon in Georgia - i...	b"Georgia's DDOS came from US sources"	b'Russian convoy heads into Georgia, violating...	b'Israeli defence minister: US against strike ...	b'Gorbachev: We Had No Choice'	b'Witness: Russian forces head towards Tbilisi...	b' Quarter of Russians blame U.S. for conflict...	b'Georgian president says US military will ta...	b'2006: Nobel laureate Aleksander Solzhenitsyn...
4	2008-08-14	1	b'All the experts admit that we should legalis...	b'War in South Osetia - 89 pictures made by a ...	b'Swedish wrestler Ara Abrahamian throws away ...	b'Russia exaggerated the death toll in South O...	b'Missile That Killed 9 Inside Pakistan May Ha...	b"Rushdie Condemns Random House's Refusal to P...	b'Poland and US agree to missle defense deal. ...	b'Will the Russians conquer Tblisi? Bet on it,...	...	b'Bank analyst forecast Georgian crisis 2 days...	b"Georgia confict could set back Russia's US r...	b'War in the Caucasus is as much the product o...	b'"Non-media" photos of South Ossetia/Georgia ...	b'Georgian TV reporter shot by Russian sniper ...	b'Saudi Arabia: Mother moves to block child ma...	b'Taliban wages war on humanitarian aid workers'	b'Russia: World "can forget about" Georgia\'s...	b'Darfur rebels accuse Sudan of mounting major...	b'Philippines : Peace Advocate say Muslims nee...
5 rows × 27 columns
'''
# It's quite simple and intuitive: if the Label is 1, the DJIA rose or stayed flat that day; if it is 0, the DJIA fell that day.

# Split into training / testing sets
# Now we can split the data into Training/Testing data

train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']
# Then we treat each news item as its own sentence and gather them all together:

X_train = train[train.columns[2:]]
corpus = X_train.values.flatten().astype(str)

X_train = X_train.values.astype(str)
X_train = np.array([' '.join(x) for x in X_train])
X_test = test[test.columns[2:]]
X_test = X_test.values.astype(str)
X_test = np.array([' '.join(x) for x in X_test])
y_train = train['Label'].values
y_test = test['Label'].values
# Note that we need three things here:

# corpus is all of the text we can "see". We treat each news item as one sentence; flatten() then gives us a list of sentences.

# X_train and X_test, however, must not be flattened arbitrarily: they have to stay aligned with y_train and y_test.

corpus[:3]
'''
array([ 'b"Georgia \'downs two Russian warplanes\' as countries move to brink of war"',
       "b'BREAKING: Musharraf to be impeached.'",
       "b'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)'"], 
      dtype='
X_train[:1]
'''
array([ 'b"Georgia \'downs two Russian warplanes\' as countries move to brink of war" b\'BREAKING: Musharraf to be impeached.\' b\'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)\' b\'Russian tanks are moving towards the capital of South Ossetia, which has reportedly been completely destroyed by Georgian artillery fire\' b"Afghan children raped with \'impunity,\' U.N. official says - this is sick, a three year old was raped and they do nothing" b\'150 Russian tanks have entered South Ossetia whilst Georgia shoots down two Russian jets.\' b"Breaking: Georgia invades South Ossetia, Russia warned it would intervene on SO\'s side" b"The \'enemy combatent\' trials are nothing but a sham: Salim Haman has been sentenced to 5 1/2 years, but will be kept longer anyway just because they feel like it." b\'Georgian troops retreat from S. Osettain capital, presumably leaving several hundred people killed. [VIDEO]\' b\'Did the U.S. Prep Georgia for War with Russia?\' b\'Rice Gives Green Light for Israel to Attack Iran: Says U.S. has no veto over Israeli military ops\' b\'Announcing:Class Action Lawsuit on Behalf of American Public Against the FBI\' b"So---Russia and Georgia are at war and the NYT\'s top story is opening ceremonies of the Olympics?  What a fucking disgrace and yet further proof of the decline of journalism." b"China tells Bush to stay out of other countries\' affairs" b\'Did World War III start today?\' b\'Georgia Invades South Ossetia - if Russia gets involved, will NATO absorb Georgia and unleash a full scale war?\' b\'Al-Qaeda Faces Islamist Backlash\' b\'Condoleezza Rice: "The US would not act to prevent an Israeli strike on Iran." Israeli Defense Minister Ehud Barak: "Israel is prepared for uncompromising victory in the case of military hostilities."\' b\'This is a busy day:  The European Union has approved new sanctions against Iran in protest at its nuclear programme.\' b"Georgia will withdraw 1,000 soldiers from Iraq to help fight off Russian forces in Georgia\'s breakaway region of South Ossetia" b\'Why the Pentagon Thinks Attacking Iran is a Bad Idea - US News & World Report\' b\'Caucasus in crisis: Georgia invades South Ossetia\' b\'Indian shoe manufactory  - And again in a series of "you do not like your work?"\' b\'Visitors Suffering from Mental Illnesses Banned from Olympics\' b"No Help for Mexico\'s Kidnapping Surge"'], 
      dtype='
y_train[:5]
array([0, 1, 0, 0, 1])
Now let's split everything into individual words:

Again, corpus and X_train are handled differently.

from nltk.tokenize import word_tokenize

corpus = [word_tokenize(x) for x in corpus]
X_train = [word_tokenize(x) for x in X_train]
X_test = [word_tokenize(x) for x in X_test]
After tokenizing,

we can see that although corpus and X are both two-dimensional arrays, they now mean different things:

in corpus, each second-level element is one sentence;

in X, each second-level element is one data point (matching one label).


corpus[:2]
[['b',
  "''",
  'Georgia',
  "'downs",
  'two',
  'Russian',
  'warplanes',
  "'",
  'as',
  'countries',
  'move',
  'to',
  'brink',
  'of',
  'war',
  "''"],
 ["b'BREAKING", ':', 'Musharraf', 'to', 'be', 'impeached', '.', "'"]]
Preprocessing
Let's do some preprocessing to make the text more uniform:

lowercase everything

remove stopwords

remove numbers and symbols

lemmatize

We wrap all of these steps into a single function:

# stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')

# numbers
import re
def hasNumbers(inputString):
    return bool(re.search(r'\d', inputString))

# special symbols
def isSymbol(inputString):
    return bool(re.match(r'[^\w]', inputString))

# lemma
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def check(word):
    """
    Return True if we want to keep this word,
    False if it should be dropped.
    """
    word = word.lower()
    if word in stop:
        return False
    elif hasNumbers(word) or isSymbol(word):
        return False
    else:
        return True

# put the checks above together into one preprocessing function
def preprocessing(sen):
    res = []
    for word in sen:
        if check(word):
            # This only strips the b'/b" markers left over from Python byte strings stored as str; the raw data wasn't fully cleaned earlier, and other cases won't have this issue
            word = word.lower().replace("b'", '').replace('b"', '').replace('"', '').replace("'", '')
            res.append(wordnet_lemmatizer.lemmatize(word))
    return res
Now run all three datasets through it:

corpus = [preprocessing(x) for x in corpus]
X_train = [preprocessing(x) for x in X_train]
X_test = [preprocessing(x) for x in X_test]
Let's see what the data looks like after processing:

print(corpus[553])
print(X_train[523])
['north', 'korean', 'leader', 'kim', 'jong-il', 'confirmed', 'ill']
['two', 'redditors', 'climbing', 'mt', 'kilimanjaro', 'charity', 'bidding', 'peak', 'nt', 'squander', 'opportunity', 'let', 'upvotes', 'something', 'awesome', 'estimated', 'take', 'year', 'clear', 'lao', 'explosive', 'remnant', 'left', 'behind', 'united', 'state', 'bomber', 'year', 'ago', 'people', 'died', 'unexploded', 'ordnance', 'since', 'conflict', 'ended', 'fidel', 'ahmadinejad', 'slandering', 'jew', 'mossad', 'america', 'israel', 'intelligence', 'agency', 'target', 'united', 'state', 'intensively', 'among', 'nation', 'considered', 'friendly', 'washington', 'israel', 'lead', 'others', 'active', 'espionage', 'directed', 'american', 'company', 'defense', 'department', 'australian', 'election', 'day', 'poll', 'rural/regional', 'independent', 'member', 'parliament', 'support', 'labor', 'minority', 'goverment', 'julia', 'gillard', 'prime', 'minister', 'france', 'plan', 'raise', 'retirement', 'age', 'set', 'strike', 'britain', 'parliament', 'police', 'murdoch', 'paper', 'adviser', 'pm', 'implicated', 'voicemail', 'hacking', 'scandal', 'british', 'policeman', 'jailed', 'month', 'cell', 'attack', 'woman', 'rest', 'email', 'display', 'fundemental', 'disdain', 'pluralistic', 'america', 'reveals', 'chilling', 'level', 'islamophobia', 'hatemongering', 'church', 'plan', 'burn', 'quran', 'endanger', 'troop', 'u', 'commander', 'warns', 'freed', 'journalist', 'tricked', 'captor', 'twitter', 'access', 'manila', 'water', 'crisis', 'expose', 'impact', 'privatisation', 'july', 'week-long', 'rationing', 'water', 'highlighted', 'reality', 'million', 'people', 'denied', 'basic', 'right', 'potable', 'water', 'sanitation', 'private', 'firm', 'rake', 'profit', 'expense', 'weird', 'uk', 'police', 'ask', 'help', 'case', 'slain', 'intelligence', 'agent', 'greenpeace', 'japan', 'anti-whaling', 'activist', 'found', 'guilty', 'theft', 'captured', 'journalist', 'trick', 'captor', 'revealing', 'alive', 'creepy', 'biometric', 'id', 'forced', 'onto', 'india', 'billion', 'inhabitant', 'fear', 'loss', 'privacy', 'government', 'abuse', 'abound', 'india', 'gear', 'biometrically', 'identify', 'number', 'billion', 'inhabitant', 'china', 'young', 'officer', 'syndrome', 'china', 'military', 'spending', 'growing', 'fast', 'overtaken', 'strategy', 'said', 'professor', 'huang', 'jing', 'school', 'public', 'policy', 'young', 'officer', 'taking', 'control', 'strategy', 'like', 'young', 'officer', 'japan', 'mexican', 'soldier', 'open', 'fire', 'family', 'car', 'military', 'checkpoint', 'killing', 'father', 'son', 'death', 'toll', 'continues', 'climb', 'guatemala', 'landslide', 'foreign', 'power', 'stop', 'interfering', 'case', 'iranian', 'woman', 'sentenced', 'death', 'stoning', 'iran', 'foreign', 'ministry', 'said', 'mexican', 'official', 'gunman', 'behind', 'massacre', 'killed', 'tv', 'anchor', 'stabbed', 'death', 'outside', 'kabul', 'home', 'mosque', 'menace', 'confined', 'lower', 'manhattan', 'many', 'european', 'country', 'similar', 'alarm', 'sounded', 'muslim', 'coming', 'french', 'citizen', 'barred', 'american', 'military', 'base', 'dutch', 'neo-nazi', 'donates', 'sperm', 'white', 'dutch', 'neo-nazi', 'offered', 'donate', 'sperm', 'four', 'fertility', 'clinic', 'netherlands', 'effort', 'promote', 'call', 'strong', 'white', 'race']

Training the NLP model
With these clean datasets in hand, we can build our NLP model.

Here we will use FastText.

I covered the theory in the slides; here we look at how to actually use it.

Because the paper was only recently published, many community contributors are still contributing code to get an open-source Python version out as soon as possible (I'm one of them).

Of course, since the Facebook team has already released the (C++) source code on GitHub,

we can build an interface with a python wrapper to make it easy to call.

First, as discussed, FastText treats the label as just another token and feeds it into the word2vec-style network.

So we need to stuff the label into our "sentences":

for i in range(len(y_train)):
    label = '__label__' + str(y_train[i])
    X_train[i].append(label)

print(X_train[49])
'''
['the', 'man', 'podium', 'dutch', 'non-profit', 'reproductive', 'health', 'organization', 'sail', 'ship', 'around', 'world', 'anchoring', 'international', 'water', 'provide', 'abortion', 'woman', 'country', 'abortion', 'banned', 'b', 'grand', 'ayatollah', 'issue', 'decree', 'calling', 'muslim', 'defend', 'iraq', 'christian', 'marx', 'da', 'kapital', 'sale', 'soar', 'among', 'young', 'german', 'a', 'man', 'england', 'killed', 'wife', 'changed', 'facebook', 'relationship', 'status', 'single', 'georgia', 'used', 'cluster', 'bomb', 'august', 'war', 'arctic', 'temperature', 'break', 'all-time', 'recorded', 'high', 'reddit', 'please', 'send', 'help', 'uk', 'politician', 'insane', 'apparently', 'monitoring', 'mobile', 'web', 'record', 'would', 'giving', 'licence', 'terrorist', 'kill', 'people', 'wow', 'secret', 'coded', 'message', 'embedded', 'child', 'pornographic', 'image', 'paedophile', 'website', 'exploited', 'secure', 'way', 'passing', 'information', 'terrorist', 'england', 'run', 'honey', 'christmas', 'catastrophic', 'honeybee', 'decline', 'b', 'iran', 'stop', 'executing', 'youth', 'china', 'watch', 'internet', 'caf', 'customer', 'web', 'crackdown', 'china\\', 'medium', 'freedom', 'reduced', 'new', 'measure', 'include', 'camera', 'internet', 'cafe', 'picture', 'taken', 'user', 'bali', 'bombing', 'new', 'suspect', 'hindu', 'american', 'foundation', 'petition', 'ny', 'time', 'focus', 'much', 'activity', 'christian', 'missionary', 'india', 'anti-christian', 'violence', 'a', 'quick', 'overview', 'islamic', 'terror', 'organization', 'get', 'funding', 'last', 'titantic', 'survivor', 'auction', 'memento', 'pay', 'nursing', 'home', 'better', 'hungary', 'get', 'loan', 'avert', 'meltdown', 'sao', 'paolo', 'hundred', 'black-clad', 'military', 'police', 'fired', 'teargas', 'stun', 'grenade', 'rubber', 'bullet', 'striking', 'civilian', 'officer', 'seeking', 'percent', 'pay', 'raise', 'austrailian', 'historian', 'arrested', 'holocaust', 'denial', 'defense', 'secretary', 'gate', 'said', 'prepared', 'reconciliation', 'taliban', 'part', 'political', 'outcome', 'afghanistan', 'is', 'switzerland', 'next', 'iceland', 'switzerland', 'forced', 'take', 'emergency', 'measure', 'yesterday', 'shore', 'two', 'biggest', 'lender', 'prevent', 'collapse', 'confidence', 'country\\', 'banking', 'system', 'police', 'battle', 'police', 'sao', 'paulo', 'civilian', 'killed', 'nato', 'air', 'strike', 'afghanistan', 'villager', 'the', 'west', 'loss', 'afghanistan', '__label__0']
'''
Then we write the data out to files, because the FastText used here is only a Python interface; the actual work happens through the C++ code.

We need to save three things:

the training set, with labels

the test set, without labels

the labels themselves, in a separate file

X_train = [' '.join(x) for x in X_train]

print(X_train[12])
'''
north korea halt denuclearisation u fails remove list state sponsoring terrorism child among dead u airstrike afghanistan the russian parliament voted overwhelmingly officially recognize independence abkhazia south ossetia violent animal right activist set fire scientist home little protection available scientist nbc censored olympic champion matthew mitcham gay un say convincing evidence show u airstrike afghanistan killed people including child italy try outlaw islam mystery virus kill israeli group peace say settlement construction occupied west bank nearly doubled since last year b revealed britain secret propaganda war al-qaida b israel settlement surge draw rice criticism solar powered carbon neutral pyramid house million people dubai russia claim proof genocide how nato transformed military alliance quasi-united nation cartwheeling banned school philly-area activist released china jeff said slapped around threatend saying want head cut want shot b vatican describes hindu attack christian orphanage god protester tell tale beijing detention- sleep deprivation threat oh python kill zookeeper kelly murdered say uk intelligence insider b fury image myra hindley appears british film olympics party b north korea suspend nuclear disablement german suspect bayer pesticide beehive collapse research terrorism invaluable fear arrest top u diplomat escape gun attack pakistan __label__1
'''
The test set is handled in exactly the same way:
X_test = [' '.join(x) for x in X_test]

with open('../input/train_ft.txt', 'w') as f:
    for sen in X_train:
        f.write(sen+'\n')

with open('../input/test_ft.txt', 'w') as f:
    for sen in X_test:
        f.write(sen+'\n')

with open('../input/test_label_ft.txt', 'w') as f:
    for label in y_test:
        f.write(str(label)+'\n')
Calling the FastText module
import fasttext

clf = fasttext.supervised('../input/train_ft.txt', 'model', dim=256, ws=5, neg=5, epoch=100, min_count=10, lr=0.1, lr_update_rate=1000, bucket=200000)
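
(The call above uses the old standalone python wrapper. If you are on the current official bindings — pip install fasttext — the rough equivalent is sketched below; the parameter names follow that API and the values mirror the ones above.)

import fasttext

clf = fasttext.train_supervised('../input/train_ft.txt',
                                dim=256, ws=5, neg=5, epoch=100,
                                minCount=10, lr=0.1, lrUpdateRate=1000, bucket=200000)
clf.save_model('model.bin')   # roughly what the second argument ('model') did above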
Once our FastText model is trained, we can test it on the test set.

y_scores = []

# use predict to get the model's decisions
labels = clf.predict(X_test)

y_preds = np.array(labels).flatten().astype(int)

# take a look at the results
print(len(y_test))
print(y_test)
print(len(y_preds))
print(y_preds)

from sklearn import metrics

# compute the AUC
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_preds, pos_label=1)
print(metrics.auc(fpr, tpr))
378
[1 0 0 1 1 0 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1
 0 0 1 0 0 1 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 0 1 1 1 0 1 1 0 0 1
 0 1 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1
 0 0 1 1 1 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0
 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 0 1
 0 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1 0 0 1 0 1 0 0 0 1 1
 1 0 1 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 1 1 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 1 1
 0 1 0 1 0 1 1 0 1 0 1 1 0 0 1 1 0 0 0 0 0 1 1 1 0 0 1 0 1 1 0 0 1 1 1 1 1
 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 1 1 1 0 1 1 1 0 1 0 1 1 0
 0 1 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 1 1 0 1 0 1 1 0 1 1 1 0 0 0 0 0 1 0 1 1
 0 1 0 0 1 1 1 1]
378
[0 1 0 1 1 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 1 0 0 1 1 0 1 1 1 0 1 1 0 0 0 1 1
 1 1 1 0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0
 1 1 0 1 0 0 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 0 0 1 1 1 1 0 1 0 1
 1 1 0 0 1 1 1 1 1 0 0 1 0 0 1 1 0 1 0 0 1 0 1 0 1 1 0 0 0 1 0 0 1 1 1 1 1
 0 0 1 1 0 0 0 1 1 0 1 1 1 0 1 0 1 0 0 1 1 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 1
 1 1 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 0 0 1 0 0 1 0
 1 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1 1 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1 1 1
 1 1 0 0 0 1 1 0 1 1 1 0 1 0 1 1 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 0 0 0 0 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1 1 1 1 0
 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0
 1 1 1 0 1 1 0 1]
0.463877688172
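Scoring the AUC on hard 0/1 predictions throws information away. A hedged sketch (assuming the wrapper exposes predict_proba, as the old python wrapper does) that ranks by the predicted probability of class 1 instead, which is presumably what the empty y_scores list above was intended for:

from sklearn.metrics import roc_auc_score

probs = clf.predict_proba(X_test, k=1)              # e.g. [[('1', 0.53)], [('0', 0.61)], ...]
y_scores = [p[0][1] if p[0][0] == '1' else 1.0 - p[0][1] for p in probs]
print(roc_auc_score(y_test, y_scores))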
As before, parameter tuning or resampling can make these results better.

Keep in mind that FastText is itself essentially a word2vec-style model, with a tree-like (hierarchical softmax) classifier attached at the end.

So on a small dataset like this it will not produce an ideal result; our own pipeline with an SVM on top can actually do better.

Its strength only really shows with large amounts of data and many labels.

 
