import nltk

# Placeholder: the sentence to tokenize.
sent = ""
# Split the sentence into word and punctuation tokens.
tokens = nltk.word_tokenize(sent)


import jieba

# Full mode (cut_all=True).
seg_list = jieba.cut("", cut_all=True)
# Accurate mode (cut_all=False).
seg_list = jieba.cut("", cut_all=False)
# Search-engine mode.
seg_list = jieba.cut_for_search("")


import re
# Placeholder for the emoticon pattern (left empty in these notes).
emoticons_str = r""
# Pattern strings, emoticons first; the remaining three are empty placeholders.
regex_str = [emoticons_str, r"",r"",r""]


# Stemming (词干提取): strip affixes so that only the word stem remains.

# NLTK's stemmers live in the ``nltk.stem`` package.
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
# Backward-compatible alias for the name used previously in these notes.
porter_atomer = porter_stemmer

from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer



from nltk.stem import WordNetLemmatizer





from nltk.corpus import stopwords
# Pipeline: tokenize first, then filter.
# Build the stop-word set once so each membership test is O(1) instead of
# re-reading the stop-word list for every token.
# NOTE(review): `word_list` is not defined in these notes; it is presumably
# the token list produced by the tokenization step.
english_stopwords = set(stopwords.words('english'))
filtered_words = [word for word in word_list if word not in english_stopwords]



# Load a sentiment lexicon: one "<word>\t<integer score>" entry per line.
sentiment_dictionary = {}
with open('') as lexicon_file:  # placeholder path, left empty in these notes
    for line in lexicon_file:
        word, score = line.split('\t')
        sentiment_dictionary[word] = int(score)
# Words missing from the lexicon contribute 0.
# NOTE(review): `words` is not defined in these notes; presumably the token
# list of the text being scored.
total_score = sum(sentiment_dictionary.get(word, 0) for word in words)


from nltk.classify import NaiveBayesClassifier
s1 = ""
s2 = ""
s3 = ""
s4 = ""
def preprocess(s):
    """Turn a sentence into a bag-of-words feature dict.

    The sentence is lower-cased and split on whitespace; every distinct word
    becomes a key mapped to ``True``.
    """
    return {word: True for word in s.lower().split()}
# Labelled training examples as [feature dict, label] pairs.
# TODO(review): the list was cut off after the first entry; s2..s4 are
# defined above but their labels are not visible here, so only s1 is used.
training_data = [[preprocess(s1), 'pos']]
model = NaiveBayesClassifier.train(training_data)



import nltk
from nltk import FreqDist

corpus = ""
tokens = nltk.word_tokenize(corpus)

# Frequency distribution: maps every token to its number of occurrences.
dist = FreqDist(tokens)


# The 50 most frequent tokens as (word, count) pairs, most frequent first.
# The counts are only used to pick this vocabulary; afterwards only the word
# order matters. A new sentence is encoded as a vector of the same length in
# which each position counts the occurrences of the corresponding word.
standard_freq_vector = dist.most_common(50)
# Length of the fixed vocabulary (at most 50).
size = len(standard_freq_vector)
# 按照出现频率的大小,记录下每一个单词的位置
def position_lookup(v):
    """Map the first element of each entry in *v* to that entry's index.

    *v* is iterated in order; when the same key occurs more than once the
    last index wins.
    """
    return {entry[0]: index for index, entry in enumerate(v)}
# Record the position of every word in the standard vocabulary.
standard_position_dict = position_lookup(standard_freq_vector)
print(standard_position_dict)  # the word -> position lookup table
# A new input sentence (placeholder).
sentence = ""
freq_vector = [0] * size  # vector with one slot per vocabulary word
tokens = nltk.word_tokenize(sentence)
for word in tokens:
    try:
        # Look up the word's slot and count one more occurrence.
        freq_vector[standard_position_dict[word]] += 1
    except KeyError:
        # Words outside the standard vocabulary are ignored.
        continue



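TF:Term Frequency,衡量一个term在文档中出现得有多频繁
TF(t) = (t在文档中出现的次数) / (文档中term的总数)
IDF:Inverse Document Frequency,衡量一个term有多重要
IDF(t) = log_e(文档总数 / 含有t的文档总数)
例:一篇文档共有100个词,其中baby出现了3次,那么TF(baby) = 3/100 = 0.03
若语料共有10M篇文档,其中1000篇含有baby,那么IDF(baby) = log10(10M/1000) = 4(此例为便于计算取以10为底的对数;按上面的自然对数定义约为9.21)
TF-IDF(baby) = 0.03 * 4 = 0.12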


from nltk.text import TextCollection

# Per the original note, this class handles sentence splitting, counting
# and the computation for us.
corpus = TextCollection(["","",""])

print(corpus.tf_idf("",""))  # arguments: the term and the text it occurs in
# To obtain a standardised vector (same length for every sentence):
new_sentence = ""
# Score every word of the standard vocabulary against the new sentence, so
# each sentence yields one value per vocabulary word.
# NOTE(review): the loop body was missing and `standard_vocab` is not defined
# in these notes; printing the score is an assumed reconstruction.
for word in standard_vocab:
    print(corpus.tf_idf(word, new_sentence))






#本篇的教程里会尽量用点不一样的库,让大家感受一下Python NLP领域各个库的优缺点。


import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor # 随机森林
from nltk.stem.snowball import SnowballStemmer # 预处理

df_train = pd.read_csv('../input/train.csv', encoding="ISO-8859-1") # note the file encoding
df_test = pd.read_csv('../input/test.csv', encoding="ISO-8859-1")

df_desc = pd.read_csv('../input/product_descriptions.csv')

#id	product_uid	product_title	search_term	relevance
#0	2	100001	Simpson Strong-Tie 12-Gauge Angle	angle bracket	3.00
#1	3	100001	Simpson Strong-Tie 12-Gauge Angle	l bracket	2.50
#2	9	100002	BEHR Premium Textured DeckOver 1-gal. #SC-141 ...	deck over	3.00
#3	16	100005	Delta Vero 1-Handle Shower Only Faucet Trim Ki...	rain shower head	2.33
#4	17	100005	Delta Vero 1-Handle Shower Only Faucet Trim Ki...	shower only faucet	2.67
df_desc.head() # lookup table: product_uid -> product_description
#product_uid	product_description
#0	100001	Not only do angles make joints stronger, they ...
#1	100002	BEHR Premium Textured DECKOVER is an innovativ...
#2	100003	Classic architecture meets contemporary design...
#3	100004	The Grape Solar 265-Watt Polycrystalline PV So...
#4	100005	Update your bathroom with the Delta Vero Singl...

df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True) # stack train and test rows (axis=0) under a fresh 0..n-1 index
#id	product_title	product_uid	relevance	search_term
#0	2	Simpson Strong-Tie 12-Gauge Angle	100001	3.00	angle bracket
#1	3	Simpson Strong-Tie 12-Gauge Angle	100001	2.50	l bracket
#2	9	BEHR Premium Textured DeckOver 1-gal. #SC-141 ...	100002	3.00	deck over
#3	16	Delta Vero 1-Handle Shower Only Faucet Trim Ki...	100005	2.33	rain shower head
#4	17	Delta Vero 1-Handle Shower Only Faucet Trim Ki...	100005	2.67	shower only faucet#

#(240760, 5)

df_all = pd.merge(df_all, df_desc, how='left', on='product_uid') # left-join the product descriptions on product_uid
#id	product_title	product_uid	relevance	search_term	product_description
#0	2	Simpson Strong-Tie 12-Gauge Angle	100001	3.00	angle bracket	Not only do angles make joints stronger, they #...
#1	3	Simpson Strong-Tie 12-Gauge Angle	100001	2.50	l bracket	Not only do angles make joints stronger, they ...
#2	9	BEHR Premium Textured DeckOver 1-gal. #SC-141 ...	100002	3.00	deck over	BEHR Premium Textured DECKOVER is an innovativ...
#3	16	Delta Vero 1-Handle Shower Only Faucet Trim Ki...	100005	2.33	rain shower head	Update your bathroom with the Delta Vero Singl...
#4	17	Delta Vero 1-Handle Shower Only Faucet Trim Ki...	100005	2.67	shower only faucet	Update your bathroom with the Delta Vero Singl...

#Step 2: 文本预处理




stemmer = SnowballStemmer('english') # Snowball stemmer for English

def str_stemmer(s):
    """Lower-case *s*, stem each whitespace-separated word, and re-join.

    Returns a single string with the stemmed words separated by one space.
    """
    stemmed_words = []
    for token in s.lower().split():
        stemmed_words.append(stemmer.stem(token))
    return " ".join(stemmed_words)

def str_common_word(str1, str2):
    """Count the words of *str1* that occur in *str2*.

    Matching is by substring, so a word also counts when it appears inside
    a longer word of *str2*. Repeated words in *str1* are counted each time.
    """
    matches = 0
    for word in str1.split():
        if word in str2:
            matches += 1
    return matches

# Replace every cell of the three text columns with its stemmed form.
df_all['search_term'] = df_all['search_term'].map(str_stemmer)
df_all['product_title'] = df_all['product_title'].map(str_stemmer)
df_all['product_description'] = df_all['product_description'].map(str_stemmer)
#Step 3: 自制文本特征


# Number of words in the search query.
df_all['len_of_query'] = (
    df_all['search_term']
    .map(lambda query: len(query.split()))
    .astype(np.int64)
)
# Number of query words that also occur in the product title / description.
df_all['commons_in_title'] = df_all.apply(
    lambda row: str_common_word(row['search_term'], row['product_title']),
    axis=1,
)
df_all['commons_in_desc'] = df_all.apply(
    lambda row: str_common_word(row['search_term'], row['product_description']),
    axis=1,
)


# Drop the raw text columns now that numeric features exist.
df_all = df_all.drop(['search_term','product_title','product_description'],axis=1)
#Step 4: 重塑训练/测试集


# df_all was built with ignore_index=True, so its rows are numbered
# 0..n-1 with the training rows first. df_test carries its own 0-based
# index, so selecting df_all.loc[df_test.index] would return the FIRST rows
# of df_all (training rows) rather than the test rows. Split by position.
num_train = len(df_train)  # read before df_train is rebound below
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]
# Keep the ids so the predictions can be matched up in the submission.

test_ids = df_test['id']
y_train = df_train['relevance'].values

X_train = df_train.drop(['id','relevance'],axis=1).values # drop the target; axis=1 removes columns
X_test = df_test.drop(['id','relevance'],axis=1).values
#Step 5: 建立模型

from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import cross_val_score # 将训练集分成5份,将1份做训练,4份做测试,将测试的结果平均(交叉验证)

# Hand-rolled grid search over the forest's maximum tree depth.
params = [1, 3, 5, 6, 7, 8, 9, 10]
test_scores = []
for param in params:
    clf = RandomForestRegressor(max_depth=param, n_estimators=30)
    # The scorer yields negated MSE per fold; negate and take the root for RMSE.
    neg_mse = cross_val_score(clf, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    test_score = np.sqrt(-neg_mse)
    test_scores.append(np.mean(test_score))  # average over the 5 folds

import matplotlib.pyplot as plt
# `%matplotlib inline` is an IPython magic, not Python syntax; run it in the
# notebook cell itself when plotting inline.
plt.plot(params, test_scores)
plt.title("Param vs CV Error")


#Step 6: 上传结果

rf = RandomForestRegressor(n_estimators=30, max_depth=6)
rf.fit(X_train, y_train)
# Notebook output of the fit call (estimator repr), kept for reference only:
# RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
#            max_features='auto', max_leaf_nodes=None,
#            min_impurity_split=1e-07, min_samples_leaf=1,
#            min_samples_split=2, min_weight_fraction_leaf=0.0,
#            n_estimators=30, n_jobs=1, oob_score=False, random_state=None,
#            verbose=0, warm_start=False)
y_pred = rf.predict(X_test)

# Write the submission file: one predicted relevance per test id.
pd.DataFrame({"id": test_ids, "relevance": y_pred}).to_csv('submission.csv',index=False)


#文本预处理步骤: 你可以使用很多不同的方法来使得文本数据变得更加清洁

#自制的特征: 想出更多的特征值表达方法(关键词全段重合数量,重合比率,等等)

#更好的回归模型: 根据之前的课讲的Ensemble方法,把分类器提升到极致

# 案例:Bags of Words Meets Bags of Popcorn


import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier # 随机森林
from sklearn.metrics import confusion_matrix # 混淆矩阵
import nltk
from nltk.corpus import stopwords

# Read the training data with pandas (tab-separated, backslash as escape character).
datafile = os.path.join('..', 'data', 'labeledTrainData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
print('Number of reviews: {}'.format(len(df)))
#Number of reviews: 25000
#id	sentiment	review
#0	5814_8	1	With all this stuff going down at the moment w...
#1	2381_9	1	"The Classic War of the Worlds" by Timothy Hin...
#2	7759_3	0	The film starts with a manager (Nicholas Bell)...
#3	3630_4	0	It must be assumed that those who praised this...
#4	9495_8	1	Superbly trashy and wondrously unpretentious 8...
def display(text, title):
    """Print *title* on its own line, followed by *text*.

    NOTE(review): the body was missing from these notes; this minimal
    implementation is an assumed reconstruction.
    """
    print(title)
    print(text)


raw_example = df['review'][1]
display(raw_example, '原始数据')


"The Classic War of the Worlds" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur "critics" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the "critics". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells' classic novel, and we found it to be very entertaining. This made it easy to overlook what the "critics" perceive to be its shortcomings.
example = BeautifulSoup(raw_example, 'html.parser').get_text() # parse with BeautifulSoup and keep only the text
display(example, '去掉HTML标签的数据')


"The Classic War of the Worlds" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur "critics" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the "critics". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells' classic novel, and we found it to be very entertaining. This made it easy to overlook what the "critics" perceive to be its shortcomings.
example_letters = re.sub(r'[^a-zA-Z]', ' ', example) # replace every character that is not an ASCII letter with a space
display(example_letters, '去掉标点的数据')


 The Classic War of the Worlds  by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H  G  Wells  classic book  Mr  Hines succeeds in doing so  I  and those who watched his film with me  appreciated the fact that it was not the standard  predictable Hollywood fare that comes out every year  e g  the Spielberg version with Tom Cruise that had only the slightest resemblance to the book  Obviously  everyone looks for different things in a movie  Those who envision themselves as amateur  critics  look only to criticize everything they can  Others rate a movie on more important bases like being entertained  which is why most people never agree with the  critics   We enjoyed the effort Mr  Hines put into being faithful to H G  Wells  classic novel  and we found it to be very entertaining  This made it easy to overlook what the  critics  perceive to be its shortcomings 
words = example_letters.lower().split() # lower-case, then split on whitespace into a list
display(words, '纯词列表数据')


[u'the', u'classic', u'war', u'of', u'the', u'worlds', u'by', u'timothy', u'hines', u'is', u'a', u'very', u'entertaining', u'film', u'that', u'obviously', u'goes', u'to', u'great', u'effort', u'and', u'lengths', u'to', u'faithfully', u'recreate', u'h', u'g', u'wells', u'classic', u'book', u'mr', u'hines', u'succeeds', u'in', u'doing', u'so', u'i', u'and', u'those', u'who', u'watched', u'his', u'film', u'with', u'me', u'appreciated', u'the', u'fact', u'that', u'it', u'was', u'not', u'the', u'standard', u'predictable', u'hollywood', u'fare', u'that', u'comes', u'out', u'every', u'year', u'e', u'g', u'the', u'spielberg', u'version', u'with', u'tom', u'cruise', u'that', u'had', u'only', u'the', u'slightest', u'resemblance', u'to', u'the', u'book', u'obviously', u'everyone', u'looks', u'for', u'different', u'things', u'in', u'a', u'movie', u'those', u'who', u'envision', u'themselves', u'as', u'amateur', u'critics', u'look', u'only', u'to', u'criticize', u'everything', u'they', u'can', u'others', u'rate', u'a', u'movie', u'on', u'more', u'important', u'bases', u'like', u'being', u'entertained', u'which', u'is', u'why', u'most', u'people', u'never', u'agree', u'with', u'the', u'critics', u'we', u'enjoyed', u'the', u'effort', u'mr', u'hines', u'put', u'into', u'being', u'faithful', u'to', u'h', u'g', u'wells', u'classic', u'novel', u'and', u'we', u'found', u'it', u'to', u'be', u'very', u'entertaining', u'this', u'made', u'it', u'easy', u'to', u'overlook', u'what', u'the', u'critics', u'perceive', u'to', u'be', u'its', u'shortcomings']
#nltk.download() # the stop words bundled with NLTK could be used instead
#words_nostop = [w for w in words if w not in stopwords.words('english')]
# Load the stop words from a local file, one per line.
# NOTE: this rebinds the name `stopwords` imported from nltk.corpus above.
with open('../stopwords.txt') as stopwords_file:
    stopwords = {}.fromkeys([line.rstrip() for line in stopwords_file])
words_nostop = [w for w in words if w not in stopwords] # drop the stop words
display(words_nostop, '去掉停用词数据')


[u'classic', u'war', u'worlds', u'timothy', u'hines', u'entertaining', u'film', u'effort', u'lengths', u'faithfully', u'recreate', u'classic', u'book', u'hines', u'succeeds', u'watched', u'film', u'appreciated', u'standard', u'predictable', u'hollywood', u'fare', u'spielberg', u'version', u'tom', u'cruise', u'slightest', u'resemblance', u'book', u'movie', u'envision', u'amateur', u'critics', u'criticize', u'rate', u'movie', u'bases', u'entertained', u'people', u'agree', u'critics', u'enjoyed', u'effort', u'hines', u'faithful', u'classic', u'entertaining', u'easy', u'overlook', u'critics', u'perceive', u'shortcomings']
#eng_stopwords = set(stopwords.words('english'))
eng_stopwords = set(stopwords)  # stop words as a set; the cleaning steps are gathered into one function below
def clean_text(text):
    """Return *text* as one space-joined string of cleaned words.

    Steps: take the text content via BeautifulSoup's html.parser, replace
    every non-letter with a space, lower-case, split on whitespace, and drop
    the words found in ``eng_stopwords``.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    return ' '.join(
        token
        for token in letters_only.lower().split()
        if token not in eng_stopwords
    )
u'classic war worlds timothy hines entertaining film effort lengths faithfully recreate classic book hines succeeds watched film appreciated standard predictable hollywood fare spielberg version tom cruise slightest resemblance book movie envision amateur critics criticize rate movie bases entertained people agree critics enjoyed effort hines faithful classic entertaining easy overlook critics perceive shortcomings'
# Add the cleaned text to the dataframe as a new column.
df['clean_review'] = df.review.apply(clean_text) # clean every row's review
id	sentiment	review	clean_review
0	5814_8	1	With all this stuff going down at the moment w...	stuff moment mj ve started listening music wat...
1	2381_9	1	"The Classic War of the Worlds" by Timothy Hin...	classic war worlds timothy hines entertaining ...
2	7759_3	0	The film starts with a manager (Nicholas Bell)...	film starts manager nicholas bell investors ro...
3	3630_4	0	It must be assumed that those who praised this...	assumed praised film filmed opera didn read do...
4	9495_8	1	Superbly trashy and wondrously unpretentious 8...	superbly trashy wondrously unpretentious explo...

# Extract bag-of-words features with sklearn's CountVectorizer, capped at 5000 features.
vectorizer = CountVectorizer(max_features = 5000) 
train_data_features = vectorizer.fit_transform(df.clean_review).toarray()
(25000, 5000)

# Train a random-forest classifier on the bag-of-words features.
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(train_data_features, df.sentiment)
# NOTE: this confusion matrix is computed on the training data itself.
confusion_matrix(df.sentiment, forest.predict(train_data_features))
array([[12500,     0],
       [    0, 12500]])
del df
del train_data_features

# Load the test reviews and clean them the same way as the training data.
datafile = os.path.join('..', 'data', 'testData.tsv')
df = pd.read_csv(datafile, sep='\t', escapechar='\\')
print('Number of reviews: {}'.format(len(df)))
df['clean_review'] = df.review.apply(clean_text)
Number of reviews: 25000
id	review	clean_review
0	12311_10	Naturally in a film who's main themes are of m...	naturally film main themes mortality nostalgia...
1	8348_2	This movie is a disaster within a disaster fil...	movie disaster within disaster film full great...
2	5828_4	All in all, this is a movie for kids. We saw i...	movie kids saw tonight child loved one point k...
3	7186_2	Afraid of the Dark left me with the impression...	afraid dark left impression several different ...
4	12128_7	A very accurate depiction of small time mob li...	accurate depiction small time mob life filmed ...
test_data_features = vectorizer.transform(df.clean_review).toarray()
(25000, 5000)
# Predict a sentiment label per test review and pair it with the review id.
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
id	sentiment
0	12311_10	1
1	8348_2	0
2	5828_4	1
3	7186_2	1
4	12128_7	1
# Write the predictions to CSV, then drop the large objects.
output.to_csv(os.path.join('..', 'data', 'Bag_of_Words_model.csv'), index=False)
del df
del test_data_features

import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk.data
#from nltk.corpus import stopwordsfrom gensim.models.word2vec import Word2Vec
def load_dataset(name, nrows=None):
    """Load one of the review datasets from ``../data`` as a DataFrame.

    Args:
        name: one of ``'unlabeled_train'``, ``'labeled_train'`` or ``'test'``.
        nrows: optional row limit, passed through to ``pd.read_csv``.

    Returns:
        The DataFrame read from the tab-separated file.

    Raises:
        ValueError: if *name* is not a known dataset.
    """
    datasets = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv',
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join('..', 'data', datasets[name])
    df = pd.read_csv(data_file, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(df)))
    return df

df = load_dataset('unlabeled_train')
Number of reviews: 50000
id	review
0	9999_0	Watching Time Chasers, it obvious that it was ...
1	45057_0	I saw this film about 20 years ago and remembe...
2	15561_0	Minor Spoilers

In New York, Joan Ba... 3 7161_0 I went to see this film with a great deal of e... 4 43971_0 Yes, I agree with everyone on this site this m... '''
# Same preprocessing as in the first notebook, with one difference:
# stop-word removal is optional here.
# eng_stopwords = set(stopwords.words('english'))
# Stop words are read from a local file, one per line; the file is closed
# as soon as it has been read.
with open('../stopwords.txt') as stopwords_file:
    eng_stopwords = {}.fromkeys([line.rstrip() for line in stopwords_file])

def clean_text(text, remove_stopwords=False):
    """Return the cleaned, lower-cased words of *text* as a list.

    The text content is taken via BeautifulSoup's html.parser and every
    non-letter is replaced by a space before splitting on whitespace. When
    *remove_stopwords* is true, words found in ``eng_stopwords`` are dropped.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    tokens = re.sub(r'[^a-zA-Z]', ' ', plain).lower().split()
    if not remove_stopwords:
        return tokens
    return [token for token in tokens if token not in eng_stopwords]

# Load NLTK's English Punkt tokenizer resource; split_sentences uses it below.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def print_call_counts(f):
    """Decorator that reports progress every 1000 calls of *f*.

    A message is printed on call 1, 1001, 2001, ... and the call is then
    forwarded to *f* unchanged. The wrapper keeps *f*'s name and docstring.
    """
    import functools

    n = 0

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        nonlocal n
        n += 1
        if n % 1000 == 1:
            print('method {} called {} times'.format(f.__name__, n))
        return f(*args, **kwargs)

    return wrapped

@print_call_counts
def split_sentences(review):
    """Split *review* into sentences and clean each one.

    Returns a list with one word list per non-empty sentence, as produced
    by ``clean_text``.
    """
    cleaned = []
    for raw_sentence in tokenizer.tokenize(review.strip()):
        if raw_sentence:
            cleaned.append(clean_text(raw_sentence))
    return cleaned

# Flatten the per-review sentence lists into one list of sentences.
# This replaces `%time sentences = sum(..., [])`: `%time` is an IPython magic
# rather than Python, and summing lists copies the accumulated list on
# every step.
sentences = [
    sentence
    for review_sentences in df.review.apply(split_sentences)
    for sentence in review_sentences
]
print('{} reviews -> {} sentences'.format(len(df), len(sentences)))

# Train a word-embedding model with gensim.
import logging
from gensim.models.word2vec import Word2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Training parameters; the original notes recommend 300 or more dimensions.
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)
print('Training model...')
# Parenthesised call instead of backslash continuations, which are a syntax
# error when the backslash is followed by trailing whitespace.
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model.save(os.path.join('..', 'models', model_name))

# Inspect the trained word vectors.
print(model.doesnt_match("man woman child kitchen".split()))
print(model.doesnt_match('france england germany berlin'.split()))
# Output:
#   kitchen
#   berlin

model.most_similar("man")
# [('woman', 0.6256189346313477), ('lady', 0.5953349471092224),
#  ('lad', 0.576863169670105), ('person', 0.5407935380935669),
#  ('farmer', 0.5382746458053589), ('chap', 0.536788821220398),
#  ('soldier', 0.5292650461196899), ('men', 0.5261573791503906),
#  ('monk', 0.5237958431243896), ('guy', 0.5213091373443604)]

model.most_similar("queen")
# [('princess', 0.6749982833862305), ('maid', 0.6223365068435669),
#  ('bride', 0.6201028227806091), ('belle', 0.6200867891311646),
#  ('temple', 0.6171057224273682), ('stripper', 0.608874499797821),
#  ('catherine', 0.6072724461555481), ('eva', 0.6019693613052368),
#  ('dancer', 0.594109833240509), ('sylvia', 0.5933606624603271)]

model.most_similar("awful")
# [('terrible', 0.7551683187484741), ('atrocious', 0.7340768575668335),
#  ('horrible', 0.7315883040428162), ('dreadful', 0.7080680131912231),
#  ('abysmal', 0.7010548114776611), ('horrendous', 0.6951696872711182),
#  ('appalling', 0.691646933555603), ('horrid', 0.6708598136901855),
#  ('amateurish', 0.6481891870498657), ('embarrassing', 0.6306308507919312)]


# ---------------------------------------------------------------------------
# Train a sentiment-analysis model on top of word2vec
# ---------------------------------------------------------------------------
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

#from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans


# Same helpers as in the previous section.
def load_dataset(name, nrows=None):
    """Load one of the review datasets from ``../data`` as a DataFrame.

    Raises ValueError when *name* is not a known dataset.
    """
    datasets = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv'
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join('..', 'data', datasets[name])
    df = pd.read_csv(data_file, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(df)))
    return df


# The nltk stop-word import above is commented out, so
# `stopwords.words('english')` is not available here; read the local
# stop-word file instead, as the previous section does.
with open('../stopwords.txt') as stopwords_file:
    eng_stopwords = {line.rstrip() for line in stopwords_file}


def clean_text(text, remove_stopwords=False):
    """Return the cleaned, lower-cased words of *text* as a list."""
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words


# Load the Word2Vec model trained earlier.
model_name = '300features_40minwords_10context.model'
model = Word2Vec.load(os.path.join('..', 'models', model_name))

# Encode each review from the word2vec result. The encoding is crude:
# simply the average of the vectors of the words in the review.
df = load_dataset('labeled_train')
df.head()
# Number of reviews: 25000 (columns: id, sentiment, review)


def to_review_vector(review):
    """Encode *review* as the mean of the vectors of its in-vocabulary words."""
    words = clean_text(review, remove_stopwords=True)
    array = np.array([model[w] for w in words if w in model])
    return pd.Series(array.mean(axis=0))


train_data_features = df.review.apply(to_review_vector)
train_data_features.head()
# Output: 5 rows x 300 columns of floats (values omitted)

# Build a classifier with a random forest.
forest = RandomForestClassifier(n_estimators = 100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

# Try it on the training set as well, to check that the model works.
confusion_matrix(df.sentiment, forest.predict(train_data_features))
# array([[12500,     0],
#        [    0, 12500]])

# Free the variables that occupy memory.
del df
del train_data_features

# Predict the test set and upload the result to Kaggle.
df = load_dataset('test')
df.head()
# Number of reviews: 25000 (columns: id, review)

test_data_features = df.review.apply(to_review_vector)
test_data_features.head()
# Output: 5 rows x 300 columns of floats (values omitted)

result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.to_csv(os.path.join('..', 'data', 'Word2Vec_model.csv'), index=False)
output.head()
#          id  sentiment
# 0  12311_10          1
# 1    8348_2          0
# 2    5828_4          0
# 3    7186_2          0
# 4   12128_7          1

del df
del test_data_features
del forest

# ---------------------------------------------------------------------------
# Cluster the word vectors and encode reviews by cluster
# ---------------------------------------------------------------------------
# Cluster with KMeans.
word_vectors = model.syn0
num_clusters = word_vectors.shape[0] // 10

# (The notebook cell was timed with the `%%time` magic.)
kmeans_clustering = KMeans(n_clusters = num_clusters, n_jobs=4)
idx = kmeans_clustering.fit_predict(word_vectors)
# CPU times: user 2.03 s, sys: 377 ms, total: 2.41 s
# Wall time: 13min 19s

word_centroid_map = dict(zip(model.index2word, idx))

import pickle

filename = 'word_centroid_map_10avg.pickle'
with open(os.path.join('..', 'models', filename), 'bw') as f:
    pickle.dump(word_centroid_map, f)

#with open(os.path.join('..', 'models', filename), 'br') as f:
#    word_centroid_map = pickle.load(f)

# Print a few clusters for inspection.
for cluster in range(0,10):
    print("\nCluster %d" % cluster)
    print([w for w,c in word_centroid_map.items() if c == cluster])
# Cluster 0 ['praised', 'appreciated', 'noted', 'avoided', 'criticized', 'admired']
# Cluster 1 ['misfit', 'con', 'hoodlum', 'spy', 'rogue']
# Cluster 2 ['contrasts', 'healthy', 'glamour', 'eroticism', 'sensual']
# Cluster 3 ['matthew', 'kingsley', 'klein', 'hackman', 'meyers', 'perry',
#            'simpson', 'pullman', 'dana', 'olsen', 'ryan', 'barrie', 'caan',
#            'tho', 'farina', 'stiller', 'hutton', 'sparks', 'lillard',
#            'broderick', 'kline', 'reprise', 'mcconaughey', 'carvey', 'harrelson']
# Cluster 4 ['wolves', 'papillon', 'continent']
# Cluster 5 ['tick', 'drain', 'nailed', 'puke', 'boil', 'stalk']
# Cluster 6 ['cotton', 'denver', 'windsor', 'marsh', 'bell']
# Cluster 7 ['lighting', 'costumes', 'sfx', 'props', 'design', 'costuming',
#            'designs', 'makeup']
# Cluster 8 ['decline', 'swashbuckling', 'swashbuckler', 'prestige',
#            'potboiler', 'latter', 'glory', 'untouchables', 'fame']
# Cluster 9 ['slashed', 'butchered', 'mutilated', 'eaten', 'slaughtered',
#            'continually']

# Convert the reviews into cluster-bag vectors.
wordset = set(word_centroid_map.keys())


def make_cluster_bag(review):
    """Count, per cluster id, how many words of *review* fall into it.

    Returns a Series indexed 0..num_clusters with 0 for absent clusters.
    """
    words = clean_text(review, remove_stopwords=True)
    return (pd.Series([word_centroid_map[w] for w in words if w in wordset])
            .value_counts()
            .reindex(range(num_clusters+1), fill_value=0))


df = load_dataset('labeled_train')
df.head()
# Number of reviews: 25000 (columns: id, sentiment, review)

train_data_features = df.review.apply(make_cluster_bag)
train_data_features.head()
# Output: 5 rows x 1306 columns of counts (values omitted)

# Model with a random forest again.
forest = RandomForestClassifier(n_estimators = 100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

# Check the result on the training set.
confusion_matrix(df.sentiment, forest.predict(train_data_features))
# array([[12500,     0],
#        [    0, 12500]])

# Drop the variables that are no longer needed.
del df
del train_data_features

# Load the test data and predict.
df = load_dataset('test')
df.head()
# Number of reviews: 25000 (columns: id, review)

test_data_features = df.review.apply(make_cluster_bag)
test_data_features.head()
# Output: 5 rows x 1306 columns of counts (values omitted)

result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.to_csv(os.path.join('..', 'data', 'Word2Vec_BagOfClusters.csv'), index=False)
output.head()
#          id  sentiment
# 0  12311_10          1
# 1    8348_2          0
# 2    5828_4          1
# 3    7186_2          0
# 4   12128_7          1

# The pasted text repeated these three `del` statements; a second run would
# raise NameError, so they appear once.
del df
del test_data_features
del forest



# -*- coding: utf-8 -*-
# Created on 2015.12.09
# @author: Hanxiaoyang
from sklearn.cross_validation import train_test_split # 数据集分割
from gensim.models.word2vec import Word2Vec #构建词向量
import numpy as np
import pandas as pd
import jieba # jieba
from sklearn.externals import joblib # 数据转换二进制
from sklearn.svm import SVC
import sys  

# 载入数据,做预处理(分词),切分训练集与测试集
def load_file_and_preprocessing():
    """Load both sample sheets, segment the text and split train/test.

    Reads ``data/neg.xls`` and ``data/pos.xls``, segments column 0 of each
    with jieba, labels positives 1 and negatives 0, and holds out 20% of the
    samples. Only the segmented texts are returned; the label halves of the
    split are discarded.

    Returns:
        Tuple ``(x_train, x_test)``.
    """
    def segment(sentence):
        # jieba.cut yields the words lazily; materialise them as a list.
        return list(jieba.cut(sentence))

    neg = pd.read_excel('data/neg.xls', header=None, index=None)  # negative samples
    pos = pd.read_excel('data/pos.xls', header=None, index=None)  # positive samples
    pos['words'] = pos[0].apply(segment)
    neg['words'] = neg[0].apply(segment)
    # Use 1 for positive sentiment, 0 for negative.
    labels = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    samples = np.concatenate((pos['words'], neg['words']))
    x_train, x_test, _y_train, _y_test = train_test_split(samples, labels, test_size=0.2)
    return x_train, x_test
# 还可以使用TF-IDF
def build_sentence_vector(text, size, imdb_w2v):
    """Average the word vectors of ``text`` into a single sentence vector.

    Args:
        text: iterable of tokens making up the sentence.
        size: dimensionality of the word vectors.
        imdb_w2v: word-vector lookup; ``imdb_w2v[word]`` must return an
            array of ``size`` elements and raise ``KeyError`` for words it
            does not know.

    Returns:
        A ``(1, size)`` float array: the mean vector of the known words, or
        all zeros when no word of ``text`` is known.
    """
    vec = np.zeros((1, size))
    count = 0.
    for word in text:
        try:
            word_vec = imdb_w2v[word]
        except KeyError:
            # Unknown words do not contribute to the average.
            continue
        vec += word_vec.reshape((1, size))
        count += 1.
    if count != 0:
        vec /= count  # take the mean
    return vec
def get_train_vecs(x_train, x_test):
    """Train word2vec on the training sentences and vectorise both splits.

    Args:
        x_train: sequence of token lists used to train the word vectors.
        x_test: sequence of token lists to vectorise with the same model.

    Returns:
        ``(train_vecs, test_vecs)``: one 300-dimensional averaged sentence
        vector per row. The shape of each array is also printed.
    """
    n_dim = 300
    # Hand the sentences to the constructor so the vocabulary is built and
    # the vectors are trained; a model created without sentences knows no
    # words at all.
    imdb_w2v = Word2Vec(list(x_train), size=n_dim, min_count=10)

    train_vecs = np.concatenate(
        [build_sentence_vector(z, n_dim, imdb_w2v) for z in x_train])
    print(train_vecs.shape)

    # Build the test vectors with the same model.
    test_vecs = np.concatenate(
        [build_sentence_vector(z, n_dim, imdb_w2v) for z in x_test])
    print(test_vecs.shape)
    return train_vecs, test_vecs
def get_data():
    """Return the ``(train_vecs, y_train, test_vecs, y_test)`` tuple.

    NOTE(review): none of these names is assigned inside the function, so
    they must exist as module-level globals when it is called -- confirm
    where they are meant to come from.
    """
    dataset = (train_vecs, y_train, test_vecs, y_test)
    return dataset
def svm_train(train_vecs, y_train, test_vecs, y_test):
    """Fit an RBF-kernel SVM, persist it and print its test accuracy.

    Args:
        train_vecs: training feature rows.
        y_train: labels for ``train_vecs``.
        test_vecs: held-out feature rows.
        y_test: labels for ``test_vecs``.

    The fitted classifier is written to ``svm_data/svm_model/model.pkl``.
    """
    clf = SVC(kernel='rbf')
    clf.fit(train_vecs, y_train)
    joblib.dump(clf, 'svm_data/svm_model/model.pkl')
    print(clf.score(test_vecs, y_test))
def get_predict_vecs(words):
    """Turn a token list into a 300-dimensional averaged sentence vector.

    The word vectors are loaded from ``svm_data/w2v_model/w2v_model.pkl``
    on every call.
    """
    vector_size = 300
    w2v_model = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    return build_sentence_vector(words, vector_size, w2v_model)
def svm_predict(string):
    """Print ``string`` followed by its predicted sentiment.

    The text is segmented with jieba, averaged into a sentence vector and
    classified by the SVM stored at ``svm_data/svm_model/model.pkl``.
    A prediction of 1 is reported as positive, anything else as negative.
    """
    words = list(jieba.cut(string))
    words_vecs = get_predict_vecs(words)
    # NOTE: joblib.load unpickles the file; only load models you produced.
    clf = joblib.load('svm_data/svm_model/model.pkl')
    result = clf.predict(words_vecs)
    if int(result[0]) == 1:
        label = ' positive'
    else:
        label = ' negative'
    # A single formatted argument prints the same text on Python 2 and 3.
    print('%s %s' % (string, label))




import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from datetime import date

# Load the Combined_News_DJIA CSV into a DataFrame.
data = pd.read_csv('../input/Combined_News_DJIA.csv')

Date	Label	Top1	Top2	Top3	Top4	Top5	Top6	Top7	Top8	...	Top16	Top17	Top18	Top19	Top20	Top21	Top22	Top23	Top24	Top25
0	2008-08-08	0	b"Georgia 'downs two Russian warplanes' as cou...	b'BREAKING: Musharraf to be impeached.'	b'Russia Today: Columns of troops roll into So...	b'Russian tanks are moving towards the capital...	b"Afghan children raped with 'impunity,' U.N. ...	b'150 Russian tanks have entered South Ossetia...	b"Breaking: Georgia invades South Ossetia, Rus...	b"The 'enemy combatent' trials are nothing but...	...	b'Georgia Invades South Ossetia - if Russia ge...	b'Al-Qaeda Faces Islamist Backlash'	b'Condoleezza Rice: "The US would not act to p...	b'This is a busy day: The European Union has ...	b"Georgia will withdraw 1,000 soldiers from Ir...	b'Why the Pentagon Thinks Attacking Iran is a ...	b'Caucasus in crisis: Georgia invades South Os...	b'Indian shoe manufactory - And again in a se...	b'Visitors Suffering from Mental Illnesses Ban...	b"No Help for Mexico's Kidnapping Surge"
1	2008-08-11	1	b'Why wont America and Nato help us? If they w...	b'Bush puts foot down on Georgian conflict'	b"Jewish Georgian minister: Thanks to Israeli ...	b'Georgian army flees in disarray as Russians ...	b"Olympic opening ceremony fireworks 'faked'"	b'What were the Mossad with fraudulent New Zea...	b'Russia angered by Israeli military sale to G...	b'An American citizen living in S.Ossetia blam...	...	b'Israel and the US behind the Georgian aggres...	b'"Do not believe TV, neither Russian nor Geor...	b'Riots are still going on in Montreal (Canada...	b'China to overtake US as largest manufacturer'	b'War in South Ossetia [PICS]'	b'Israeli Physicians Group Condemns State Tort...	b' Russia has just beaten the United States ov...	b'Perhaps *the* question about the Georgia - R...	b'Russia is so much better at war'	b"So this is what it's come to: trading sex fo...
2	2008-08-12	0	b'Remember that adorable 9-year-old who sang a...	b"Russia 'ends Georgia operation'"	b'"If we had no sexual harassment we would hav...	b"Al-Qa'eda is losing support in Iraq because ...	b'Ceasefire in Georgia: Putin Outmaneuvers the...	b'Why Microsoft and Intel tried to kill the XO...	b'Stratfor: The Russo-Georgian War and the Bal...	b"I'm Trying to Get a Sense of This Whole Geor...	...	b'U.S. troops still in Georgia (did you know t...	b'Why Russias response to Georgia was right'	b'Gorbachev accuses U.S. of making a "serious ...	b'Russia, Georgia, and NATO: Cold War Two'	b'Remember that adorable 62-year-old who led y...	b'War in Georgia: The Israeli connection'	b'All signs point to the US encouraging Georgi...	b'Christopher King argues that the US and NATO...	b'America: The New Mexico?'	b"BBC NEWS | Asia-Pacific | Extinction 'by man...
3	2008-08-13	0	b' U.S. refuses Israel weapons to attack Iran:...	b"When the president ordered to attack Tskhinv...	b' Israel clears troops who killed Reuters cam...	b'Britain\'s policy of being tough on drugs is...	b'Body of 14 year old found in trunk; Latest (...	b'China has moved 10 *million* quake survivors...	b"Bush announces Operation Get All Up In Russi...	b'Russian forces sink Georgian ships '	...	b'Elephants extinct by 2020?'	b'US humanitarian missions soon in Georgia - i...	b"Georgia's DDOS came from US sources"	b'Russian convoy heads into Georgia, violating...	b'Israeli defence minister: US against strike ...	b'Gorbachev: We Had No Choice'	b'Witness: Russian forces head towards Tbilisi...	b' Quarter of Russians blame U.S. for conflict...	b'Georgian president says US military will ta...	b'2006: Nobel laureate Aleksander Solzhenitsyn...
4	2008-08-14	1	b'All the experts admit that we should legalis...	b'War in South Osetia - 89 pictures made by a ...	b'Swedish wrestler Ara Abrahamian throws away ...	b'Russia exaggerated the death toll in South O...	b'Missile That Killed 9 Inside Pakistan May Ha...	b"Rushdie Condemns Random House's Refusal to P...	b'Poland and US agree to missle defense deal. ...	b'Will the Russians conquer Tblisi? Bet on it,...	...	b'Bank analyst forecast Georgian crisis 2 days...	b"Georgia confict could set back Russia's US r...	b'War in the Caucasus is as much the product o...	b'"Non-media" photos of South Ossetia/Georgia ...	b'Georgian TV reporter shot by Russian sniper ...	b'Saudi Arabia: Mother moves to block child ma...	b'Taliban wages war on humanitarian aid workers'	b'Russia: World "can forget about" Georgia\'s...	b'Darfur rebels accuse Sudan of mounting major...	b'Philippines : Peace Advocate say Muslims nee...
5 rows × 27 columns

# 接下来,我们先把数据分成 Training/Testing data

# Chronological split: rows dated before 2015 form the training set, the
# rest the test set (assumes 'Date' holds ISO 'YYYY-MM-DD' strings, which
# compare in date order).
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']
y_train = train['Label'].values
y_test = test['Label'].values

# Every column after the first two holds headline text.
# corpus: one string per individual training headline.
corpus = train[train.columns[2:]].values.flatten().astype(str)

# X_*: one string per row, all of that row's headlines joined by spaces.
X_train = np.array(
    [' '.join(row) for row in train[train.columns[2:]].values.astype(str)])
X_test = np.array(
    [' '.join(row) for row in test[test.columns[2:]].values.astype(str)])

# corpus 是我们全部『可见』的文本资料。我们假设每条新闻就是一句话,把它们全部 flatten() 之后,就得到一个 list of sentences。


array([ 'b"Georgia \'downs two Russian warplanes\' as countries move to brink of war"',
       "b'BREAKING: Musharraf to be impeached.'",
       "b'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)'"], 
array([ 'b"Georgia \'downs two Russian warplanes\' as countries move to brink of war" b\'BREAKING: Musharraf to be impeached.\' b\'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)\' b\'Russian tanks are moving towards the capital of South Ossetia, which has reportedly been completely destroyed by Georgian artillery fire\' b"Afghan children raped with \'impunity,\' U.N. official says - this is sick, a three year old was raped and they do nothing" b\'150 Russian tanks have entered South Ossetia whilst Georgia shoots down two Russian jets.\' b"Breaking: Georgia invades South Ossetia, Russia warned it would intervene on SO\'s side" b"The \'enemy combatent\' trials are nothing but a sham: Salim Haman has been sentenced to 5 1/2 years, but will be kept longer anyway just because they feel like it." b\'Georgian troops retreat from S. Osettain capital, presumably leaving several hundred people killed. [VIDEO]\' b\'Did the U.S. Prep Georgia for War with Russia?\' b\'Rice Gives Green Light for Israel to Attack Iran: Says U.S. has no veto over Israeli military ops\' b\'Announcing:Class Action Lawsuit on Behalf of American Public Against the FBI\' b"So---Russia and Georgia are at war and the NYT\'s top story is opening ceremonies of the Olympics?  What a fucking disgrace and yet further proof of the decline of journalism." b"China tells Bush to stay out of other countries\' affairs" b\'Did World War III start today?\' b\'Georgia Invades South Ossetia - if Russia gets involved, will NATO absorb Georgia and unleash a full scale war?\' b\'Al-Qaeda Faces Islamist Backlash\' b\'Condoleezza Rice: "The US would not act to prevent an Israeli strike on Iran." 
Israeli Defense Minister Ehud Barak: "Israel is prepared for uncompromising victory in the case of military hostilities."\' b\'This is a busy day:  The European Union has approved new sanctions against Iran in protest at its nuclear programme.\' b"Georgia will withdraw 1,000 soldiers from Iraq to help fight off Russian forces in Georgia\'s breakaway region of South Ossetia" b\'Why the Pentagon Thinks Attacking Iran is a Bad Idea - US News & World Report\' b\'Caucasus in crisis: Georgia invades South Ossetia\' b\'Indian shoe manufactory  - And again in a series of "you do not like your work?"\' b\'Visitors Suffering from Mental Illnesses Banned from Olympics\' b"No Help for Mexico\'s Kidnapping Surge"'], 
array([0, 1, 0, 0, 1])


from nltk.tokenize import word_tokenize

# Split every text into a list of word tokens.
corpus = list(map(word_tokenize, corpus))
X_train = list(map(word_tokenize, X_train))
X_test = list(map(word_tokenize, X_test))










# 停止词
from nltk.corpus import stopwords
stop = stopwords.words('english')

# 数字
import re
def hasNumbers(inputString):
    """Return True when ``inputString`` contains at least one digit."""
    digit_match = re.search(r'\d', inputString)
    return digit_match is not None

# 特殊符号
def isSymbol(inputString):
    """Return True when ``inputString`` starts with a non-word character.

    Only the first character is inspected because ``re.match`` anchors at
    the start of the string, so a token such as ``jong-il`` is not a symbol.
    """
    leading_symbol = re.match(r'[^\w]', inputString)
    return leading_symbol is not None

# lemma
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def check(word):
    """Decide whether a token should be kept.

    A token is rejected when its lower-cased form is a stop word, contains
    a digit, or starts with a non-word character.

    Returns:
        True to keep the token, False to drop it.
    """
    word = word.lower()
    if word in stop:
        return False
    if hasNumbers(word) or isSymbol(word):
        return False
    # Reached only by tokens that passed every filter above.
    return True

# 把上面的方法综合起来
def preprocessing(sen):
    """Filter, clean and lemmatise the tokens of one sentence.

    Args:
        sen: iterable of word tokens.

    Returns:
        A list with the lemmatised form of every token accepted by
        ``check``, in the original order.
    """
    res = []
    for word in sen:
        if check(word):
            # Only needed for this data set: strip the b'...' / b"..."
            # markers and stray quotes left behind when bytes were stored
            # as str.
            word = word.lower().replace("b'", '').replace('b"', '').replace('"', '').replace("'", '')
            if word:  # nothing left once the markers are stripped
                res.append(wordnet_lemmatizer.lemmatize(word))
    return res

# Clean every tokenised text with preprocessing().
corpus = list(map(preprocessing, corpus))
X_train = list(map(preprocessing, X_train))
X_test = list(map(preprocessing, X_test))

['north', 'korean', 'leader', 'kim', 'jong-il', 'confirmed', 'ill']
['two', 'redditors', 'climbing', 'mt', 'kilimanjaro', 'charity', 'bidding', 'peak', 'nt', 'squander', 'opportunity', 'let', 'upvotes', 'something', 'awesome', 'estimated', 'take', 'year', 'clear', 'lao', 'explosive', 'remnant', 'left', 'behind', 'united', 'state', 'bomber', 'year', 'ago', 'people', 'died', 'unexploded', 'ordnance', 'since', 'conflict', 'ended', 'fidel', 'ahmadinejad', 'slandering', 'jew', 'mossad', 'america', 'israel', 'intelligence', 'agency', 'target', 'united', 'state', 'intensively', 'among', 'nation', 'considered', 'friendly', 'washington', 'israel', 'lead', 'others', 'active', 'espionage', 'directed', 'american', 'company', 'defense', 'department', 'australian', 'election', 'day', 'poll', 'rural/regional', 'independent', 'member', 'parliament', 'support', 'labor', 'minority', 'goverment', 'julia', 'gillard', 'prime', 'minister', 'france', 'plan', 'raise', 'retirement', 'age', 'set', 'strike', 'britain', 'parliament', 'police', 'murdoch', 'paper', 'adviser', 'pm', 'implicated', 'voicemail', 'hacking', 'scandal', 'british', 'policeman', 'jailed', 'month', 'cell', 'attack', 'woman', 'rest', 'email', 'display', 'fundemental', 'disdain', 'pluralistic', 'america', 'reveals', 'chilling', 'level', 'islamophobia', 'hatemongering', 'church', 'plan', 'burn', 'quran', 'endanger', 'troop', 'u', 'commander', 'warns', 'freed', 'journalist', 'tricked', 'captor', 'twitter', 'access', 'manila', 'water', 'crisis', 'expose', 'impact', 'privatisation', 'july', 'week-long', 'rationing', 'water', 'highlighted', 'reality', 'million', 'people', 'denied', 'basic', 'right', 'potable', 'water', 'sanitation', 'private', 'firm', 'rake', 'profit', 'expense', 'weird', 'uk', 'police', 'ask', 'help', 'case', 'slain', 'intelligence', 'agent', 'greenpeace', 'japan', 'anti-whaling', 'activist', 'found', 'guilty', 'theft', 'captured', 'journalist', 'trick', 'captor', 'revealing', 'alive', 'creepy', 'biometric', 'id', 'forced', 'onto', 'india', 'billion', 'inhabitant', 'fear', 
'loss', 'privacy', 'government', 'abuse', 'abound', 'india', 'gear', 'biometrically', 'identify', 'number', 'billion', 'inhabitant', 'china', 'young', 'officer', 'syndrome', 'china', 'military', 'spending', 'growing', 'fast', 'overtaken', 'strategy', 'said', 'professor', 'huang', 'jing', 'school', 'public', 'policy', 'young', 'officer', 'taking', 'control', 'strategy', 'like', 'young', 'officer', 'japan', 'mexican', 'soldier', 'open', 'fire', 'family', 'car', 'military', 'checkpoint', 'killing', 'father', 'son', 'death', 'toll', 'continues', 'climb', 'guatemala', 'landslide', 'foreign', 'power', 'stop', 'interfering', 'case', 'iranian', 'woman', 'sentenced', 'death', 'stoning', 'iran', 'foreign', 'ministry', 'said', 'mexican', 'official', 'gunman', 'behind', 'massacre', 'killed', 'tv', 'anchor', 'stabbed', 'death', 'outside', 'kabul', 'home', 'mosque', 'menace', 'confined', 'lower', 'manhattan', 'many', 'european', 'country', 'similar', 'alarm', 'sounded', 'muslim', 'coming', 'french', 'citizen', 'barred', 'american', 'military', 'base', 'dutch', 'neo-nazi', 'donates', 'sperm', 'white', 'dutch', 'neo-nazi', 'offered', 'donate', 'sperm', 'four', 'fertility', 'clinic', 'netherlands', 'effort', 'promote', 'call', 'strong', 'white', 'race']


from gensim.models.word2vec import Word2Vec

# Train word2vec on `corpus`: 128-dimensional vectors, window of 5,
# min_count of 5, 4 workers.
model = Word2Vec(corpus, size=128, window=5, min_count=5, workers=4)

array([-0.29960674,  0.03145241,  0.00570022,  0.09868251, -0.17285152,
        0.01856422, -0.01089751,  0.15153641,  0.17857222, -0.03622751,
        0.10417395, -0.0260475 ,  0.08195975, -0.06125315,  0.04687231,
        0.05752773,  0.12925589,  0.07877159, -0.13440445,  0.20191686,
        0.12656711, -0.06969397,  0.02447173, -0.02880211,  0.10401903,
        0.14345747, -0.07248937,  0.18151827,  0.04464363,  0.14008987,
        0.02799574,  0.1358372 ,  0.10382857,  0.11229188, -0.0558577 ,
        0.02732387,  0.0209927 , -0.09975895, -0.08367401, -0.05347675,
        0.0048474 ,  0.01783419,  0.13062523, -0.01942245, -0.18787207,
        0.24485843,  0.0890732 ,  0.15354921, -0.02848417, -0.17805465,
        0.12659959,  0.07361489,  0.11841691, -0.0817158 , -0.09146189,
       -0.15631667,  0.07889554,  0.06325027, -0.21279941,  0.22228   ,
       -0.11718205,  0.13774644,  0.15049173,  0.13688704,  0.33995184,
       -0.12521227,  0.01028001, -0.12642032,  0.07831606, -0.0252238 ,
       -0.03395513,  0.03965646,  0.22474508, -0.13310082,  0.13553855,
       -0.10668604,  0.22141342,  0.06522292,  0.14127599,  0.08240495,
        0.03247302,  0.30142626,  0.09993532, -0.18855172,  0.01956543,
        0.16998382, -0.155719  , -0.06757715,  0.17540725,  0.02754072,
        0.04505057, -0.06942102,  0.04041849,  0.33680534, -0.03090001,
       -0.08459242,  0.00468331, -0.08084729,  0.15038815,  0.2194476 ,
        0.10415938, -0.02096822,  0.26186588, -0.00954993,  0.1127312 ,
        0.14906277, -0.0927472 ,  0.19095857,  0.24790056, -0.003826  ,
        0.04918066, -0.02232081, -0.03569063, -0.17610529, -0.08925602,
       -0.06415266,  0.28868139,  0.01529911, -0.22414474, -0.15126266,
        0.24473965, -0.09966447,  0.22041951,  0.17169574,  0.12241554,
       -0.1190941 , -0.2071649 ,  0.04336704], dtype=float32)



# Grab the model's whole vocabulary first.
vocab = model.vocab

# 得到任意text的vector
def get_vector(word_list):
    """Return the mean word vector of the in-vocabulary words.

    Args:
        word_list: iterable of tokens.

    Returns:
        A 128-element array: the average of ``model[word]`` over the words
        found in ``vocab``, or all zeros when none of them is found.
    """
    # Start from an all-zero accumulator.
    res = np.zeros([128])
    count = 0
    for word in word_list:
        if word in vocab:
            res += model[word]
            count += 1
    if count == 0:
        # Dividing by zero would fill the vector with NaN.
        return res
    return res / count
# 此时,我们得到了一个取得任意 word list 平均 vector 值的方法:

get_vector(['hello', 'from', 'the', 'other', 'side'])
array([-0.31350832,  0.04835839,  0.0048861 ,  0.11434336, -0.1799269 ,
        0.05557305, -0.02394118,  0.16106121,  0.1858674 , -0.04397187,
        0.10581181, -0.04210376,  0.1221713 , -0.05319506,  0.04339079,
        0.0631889 ,  0.1334364 ,  0.10450788, -0.13369248,  0.17621091,
        0.11581808, -0.07839958,  0.01126511, -0.03497357,  0.1140593 ,
        0.15078972, -0.06713609,  0.17556626,  0.04463732,  0.14599135,
        0.05664013,  0.14580157,  0.13244719,  0.10890759, -0.08809417,
        0.02204922,  0.02513832, -0.10544483, -0.09301682, -0.04607506,
       -0.0043104 ,  0.03133655,  0.13699191,  0.01113589, -0.19221411,
        0.23371264,  0.07863618,  0.16435402, -0.01011975, -0.18208385,
        0.1232647 ,  0.09053386,  0.11891054, -0.0958069 , -0.06351973,
       -0.13449311,  0.08414212,  0.08572642, -0.18276297,  0.2460763 ,
       -0.13310654,  0.1621725 ,  0.14568455,  0.16453338,  0.32360496,
       -0.16287505,  0.0061395 , -0.13277827,  0.06658031, -0.00849631,
       -0.04312018,  0.05207892,  0.23483992, -0.12247395,  0.14737971,
       -0.09834758,  0.23857855,  0.10417985,  0.19205472,  0.07291839,
        0.05432127,  0.31228056,  0.10667485, -0.19113681,  0.02985532,
        0.17966536, -0.17697723, -0.07287586,  0.17143352, -0.00642007,
        0.03691518, -0.06739308,  0.06709844,  0.3335989 , -0.00951616,
       -0.10647952,  0.02660648, -0.07802326,  0.15588878,  0.23749367,
        0.10878561,  0.01947832,  0.21858906,  0.00061314,  0.14626372,
        0.16053095, -0.11741858,  0.22829354,  0.21768039, -0.00993046,
        0.08663368, -0.03933012, -0.06405959, -0.19196833, -0.08962602,
       -0.09244292,  0.31830364, -0.00367699, -0.220584  , -0.13564284,
        0.23458903, -0.12524679,  0.21924314,  0.1900594 ,  0.11750702,
       -0.10788013, -0.23775842,  0.04254359])
这样,我们可以同步把我们的X都给转化成128维的一个vector list


# Keep the token lists under separate names before X_* are overwritten.
wordlist_train, wordlist_test = X_train, X_test

# Collapse each token list into one averaged vector.
X_train = list(map(get_vector, wordlist_train))
X_test = list(map(get_vector, wordlist_test))

[-0.49616703  0.09831359  0.00215805  0.21753911 -0.24626317  0.06163961
 -0.01088745  0.24991728  0.23802179 -0.06698225  0.20064797 -0.06599116
  0.19794046 -0.07558411  0.07941745  0.06008136  0.21772295  0.1405834
 -0.18259355  0.31937215  0.17264133 -0.15873611  0.07915295 -0.04948181
  0.12474476  0.228815   -0.0983179   0.28874519  0.03865414  0.23373656
  0.02709786  0.26116451  0.17697223  0.19303173 -0.07914471  0.0832512
  0.03469482 -0.15666168 -0.07268126 -0.11324668 -0.01033463  0.05544584
  0.18356295 -0.01587121 -0.29344295  0.47934875  0.09503752  0.19703337
 -0.00700106 -0.28184425  0.23042896  0.11191312  0.18606185 -0.08041457
 -0.14015471 -0.24658055  0.11680572  0.07711736 -0.33447557  0.35767368
 -0.17762948  0.26263842  0.23744292  0.23772386  0.49314497 -0.19214054
 -0.0103213  -0.20215777  0.07296897 -0.02735564 -0.07290107  0.07402693
  0.38805058 -0.21641682  0.2155705  -0.12779231  0.35236881  0.13735026
  0.26753956  0.14389433  0.06487844  0.48379081  0.16635294 -0.29834936
  0.04311308  0.29604598 -0.19924251 -0.1253365   0.32512026  0.01877471
  0.08315832 -0.08786424  0.07361021  0.53263441  0.02459062 -0.15883806
  0.01811006 -0.13394059  0.2364613   0.32745286  0.19984239 -0.0058238
  0.4130943  -0.00106707  0.17232155  0.28846937 -0.15492516  0.33995942
  0.37123214 -0.04274277  0.11453501 -0.0671717  -0.06910405 -0.27783239
 -0.14075597 -0.12174125  0.51931463 -0.03699218 -0.40793285 -0.23604008
  0.36195668 -0.20882112  0.377449    0.29138071  0.18641824 -0.16645377
 -0.40529569  0.00538284]


from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

# Try each gamma value and record its mean 3-fold cross-validated ROC AUC.
params = [0.1,0.5,1,3,5,7,10,12,16,20,25,30,35,40]
test_scores = []
for param in params:
    clf = SVR(gamma=param)
    test_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
    # One entry per gamma, so test_scores lines up with params.
    test_scores.append(np.mean(test_score))
import matplotlib.pyplot as plt
%matplotlib inline
# Plot test_scores against the gamma values in params.
plt.plot(params, test_scores)
plt.title("Param vs CV AUC Score");







什么是padding size?


# 说明,对于每天的新闻,我们会考虑前256个单词。不够的我们用[000000]补上
# vec_size 指的是我们本身vector的size
def transform_to_matrix(x, padding_size=256, vec_size=128):
    """Turn each token list into a fixed-size matrix of word vectors.

    Args:
        x: iterable of token lists.
        padding_size: number of rows per matrix; only the first
            ``padding_size`` tokens of a list are used.
        vec_size: length of the zero row used for padding; it should equal
            the length of the vectors returned by ``model``.

    Returns:
        A list with one ``padding_size``-row matrix (list of lists) per
        input token list.
    """
    res = []
    for sen in x:
        matrix = []
        for i in range(padding_size):
            try:
                matrix.append(model[sen[i]].tolist())
            except (KeyError, IndexError):
                # Two cases end up here:
                # 1. the word is not known to the model
                # 2. the sentence is shorter than padding_size
                # Either way the row is filled with zeros.
                matrix.append([0] * vec_size)
        res.append(matrix)
    return res
# Now run the original token lists through transform_to_matrix.

X_train = transform_to_matrix(wordlist_train)
X_test = transform_to_matrix(wordlist_test)

可以看到,现在我们得到的每个样本都是一个大大的 Matrix,它的 size 是 256 * 128(每个样本 256 个单词,每个单词是 128 维的向量)



原因是我们要在每一个 matrix 外部“包裹”一层维度,来告诉我们的 CNN model:每个数据点都是独立的,彼此之间没有前后关系。

# Convert the nested lists to numpy arrays for easier handling.
X_train = np.array(X_train)
X_test = np.array(X_test)

# Array shapes seen at this point
# (samples, words per sample, word-vector length):
#   X_train: (1611, 256, 128)
#   X_test:  (378, 256, 128)

# Insert a length-1 axis at position 1; mind where the 1 goes.
X_train = X_train[:, np.newaxis, :, :]
X_test = X_test[:, np.newaxis, :, :]

(1611, 1, 256, 128)
(378, 1, 256, 128)

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers.core import Dense, Dropout, Activation, Flatten

# set parameters:
batch_size = 32  # passed to model.fit as batch_size
n_filter = 16  # first argument of the Convolution2D layer
filter_length = 4  # used for both kernel dimensions of Convolution2D
nb_epoch = 5  # passed to model.fit as nb_epoch
n_pool = 2  # used for both dimensions of MaxPooling2D's pool_size

# 新建一个sequential的模型
model = Sequential()
model.add(Convolution2D(n_filter,filter_length,filter_length, #2维CNN,
                        input_shape=(1, 256, 128)))
model.add(MaxPooling2D(pool_size=(n_pool, n_pool))) #池化层,取最大值或者平均值
model.add(Flatten()) # 需要把二维变成一维
# 后面接上一个ANN
model.add(Dense(1)) # 输出层要是1
# compile模型
Using Theano backend.

# Train silently, then evaluate on the held-out set.
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          verbose=0)
score = model.evaluate(X_test, y_test, verbose=0)
# score[0] is the loss; score[1] is reported as accuracy, which assumes the
# model was compiled with an accuracy metric.
print('Test score:', score[0])
print('Test accuracy:', score[1])
Test score: 0.492063492221
Test accuracy: 0.507936509829









import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from datetime import date

# Load the Combined_News_DJIA CSV into a DataFrame.
data = pd.read_csv('../input/Combined_News_DJIA.csv')

Date	Label	Top1	Top2	Top3	Top4	Top5	Top6	Top7	Top8	...	Top16	Top17	Top18	Top19	Top20	Top21	Top22	Top23	Top24	Top25
0	2008-08-08	0	b"Georgia 'downs two Russian warplanes' as cou...	b'BREAKING: Musharraf to be impeached.'	b'Russia Today: Columns of troops roll into So...	b'Russian tanks are moving towards the capital...	b"Afghan children raped with 'impunity,' U.N. ...	b'150 Russian tanks have entered South Ossetia...	b"Breaking: Georgia invades South Ossetia, Rus...	b"The 'enemy combatent' trials are nothing but...	...	b'Georgia Invades South Ossetia - if Russia ge...	b'Al-Qaeda Faces Islamist Backlash'	b'Condoleezza Rice: "The US would not act to p...	b'This is a busy day: The European Union has ...	b"Georgia will withdraw 1,000 soldiers from Ir...	b'Why the Pentagon Thinks Attacking Iran is a ...	b'Caucasus in crisis: Georgia invades South Os...	b'Indian shoe manufactory - And again in a se...	b'Visitors Suffering from Mental Illnesses Ban...	b"No Help for Mexico's Kidnapping Surge"
1	2008-08-11	1	b'Why wont America and Nato help us? If they w...	b'Bush puts foot down on Georgian conflict'	b"Jewish Georgian minister: Thanks to Israeli ...	b'Georgian army flees in disarray as Russians ...	b"Olympic opening ceremony fireworks 'faked'"	b'What were the Mossad with fraudulent New Zea...	b'Russia angered by Israeli military sale to G...	b'An American citizen living in S.Ossetia blam...	...	b'Israel and the US behind the Georgian aggres...	b'"Do not believe TV, neither Russian nor Geor...	b'Riots are still going on in Montreal (Canada...	b'China to overtake US as largest manufacturer'	b'War in South Ossetia [PICS]'	b'Israeli Physicians Group Condemns State Tort...	b' Russia has just beaten the United States ov...	b'Perhaps *the* question about the Georgia - R...	b'Russia is so much better at war'	b"So this is what it's come to: trading sex fo...
2	2008-08-12	0	b'Remember that adorable 9-year-old who sang a...	b"Russia 'ends Georgia operation'"	b'"If we had no sexual harassment we would hav...	b"Al-Qa'eda is losing support in Iraq because ...	b'Ceasefire in Georgia: Putin Outmaneuvers the...	b'Why Microsoft and Intel tried to kill the XO...	b'Stratfor: The Russo-Georgian War and the Bal...	b"I'm Trying to Get a Sense of This Whole Geor...	...	b'U.S. troops still in Georgia (did you know t...	b'Why Russias response to Georgia was right'	b'Gorbachev accuses U.S. of making a "serious ...	b'Russia, Georgia, and NATO: Cold War Two'	b'Remember that adorable 62-year-old who led y...	b'War in Georgia: The Israeli connection'	b'All signs point to the US encouraging Georgi...	b'Christopher King argues that the US and NATO...	b'America: The New Mexico?'	b"BBC NEWS | Asia-Pacific | Extinction 'by man...
3	2008-08-13	0	b' U.S. refuses Israel weapons to attack Iran:...	b"When the president ordered to attack Tskhinv...	b' Israel clears troops who killed Reuters cam...	b'Britain\'s policy of being tough on drugs is...	b'Body of 14 year old found in trunk; Latest (...	b'China has moved 10 *million* quake survivors...	b"Bush announces Operation Get All Up In Russi...	b'Russian forces sink Georgian ships '	...	b'Elephants extinct by 2020?'	b'US humanitarian missions soon in Georgia - i...	b"Georgia's DDOS came from US sources"	b'Russian convoy heads into Georgia, violating...	b'Israeli defence minister: US against strike ...	b'Gorbachev: We Had No Choice'	b'Witness: Russian forces head towards Tbilisi...	b' Quarter of Russians blame U.S. for conflict...	b'Georgian president says US military will ta...	b'2006: Nobel laureate Aleksander Solzhenitsyn...
4	2008-08-14	1	b'All the experts admit that we should legalis...	b'War in South Osetia - 89 pictures made by a ...	b'Swedish wrestler Ara Abrahamian throws away ...	b'Russia exaggerated the death toll in South O...	b'Missile That Killed 9 Inside Pakistan May Ha...	b"Rushdie Condemns Random House's Refusal to P...	b'Poland and US agree to missle defense deal. ...	b'Will the Russians conquer Tblisi? Bet on it,...	...	b'Bank analyst forecast Georgian crisis 2 days...	b"Georgia confict could set back Russia's US r...	b'War in the Caucasus is as much the product o...	b'"Non-media" photos of South Ossetia/Georgia ...	b'Georgian TV reporter shot by Russian sniper ...	b'Saudi Arabia: Mother moves to block child ma...	b'Taliban wages war on humanitarian aid workers'	b'Russia: World "can forget about" Georgia\'s...	b'Darfur rebels accuse Sudan of mounting major...	b'Philippines : Peace Advocate say Muslims nee...
5 rows × 27 columns

#这下,我们可以先把数据给分成Training/Testing data

# Chronological split: rows dated before 2015 form the training set, the
# rest the test set (assumes 'Date' holds ISO 'YYYY-MM-DD' strings, which
# compare in date order).
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']
y_train = train['Label'].values
y_test = test['Label'].values

# Every column after the first two holds headline text.
# corpus: one string per individual training headline.
corpus = train[train.columns[2:]].values.flatten().astype(str)

# X_*: one string per row, all of that row's headlines joined by spaces.
X_train = np.array(
    [' '.join(row) for row in train[train.columns[2:]].values.astype(str)])
X_test = np.array(
    [' '.join(row) for row in test[test.columns[2:]].values.astype(str)])

#corpus是全部我们『可见』的文本资料。我们假设每条新闻就是一句话,把他们全部flatten()了,我们就会得到list of sentences。


array([ 'b"Georgia \'downs two Russian warplanes\' as countries move to brink of war"',
       "b'BREAKING: Musharraf to be impeached.'",
       "b'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)'"], 
array([ 'b"Georgia \'downs two Russian warplanes\' as countries move to brink of war" b\'BREAKING: Musharraf to be impeached.\' b\'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)\' b\'Russian tanks are moving towards the capital of South Ossetia, which has reportedly been completely destroyed by Georgian artillery fire\' b"Afghan children raped with \'impunity,\' U.N. official says - this is sick, a three year old was raped and they do nothing" b\'150 Russian tanks have entered South Ossetia whilst Georgia shoots down two Russian jets.\' b"Breaking: Georgia invades South Ossetia, Russia warned it would intervene on SO\'s side" b"The \'enemy combatent\' trials are nothing but a sham: Salim Haman has been sentenced to 5 1/2 years, but will be kept longer anyway just because they feel like it." b\'Georgian troops retreat from S. Osettain capital, presumably leaving several hundred people killed. [VIDEO]\' b\'Did the U.S. Prep Georgia for War with Russia?\' b\'Rice Gives Green Light for Israel to Attack Iran: Says U.S. has no veto over Israeli military ops\' b\'Announcing:Class Action Lawsuit on Behalf of American Public Against the FBI\' b"So---Russia and Georgia are at war and the NYT\'s top story is opening ceremonies of the Olympics?  What a fucking disgrace and yet further proof of the decline of journalism." b"China tells Bush to stay out of other countries\' affairs" b\'Did World War III start today?\' b\'Georgia Invades South Ossetia - if Russia gets involved, will NATO absorb Georgia and unleash a full scale war?\' b\'Al-Qaeda Faces Islamist Backlash\' b\'Condoleezza Rice: "The US would not act to prevent an Israeli strike on Iran." 
Israeli Defense Minister Ehud Barak: "Israel is prepared for uncompromising victory in the case of military hostilities."\' b\'This is a busy day:  The European Union has approved new sanctions against Iran in protest at its nuclear programme.\' b"Georgia will withdraw 1,000 soldiers from Iraq to help fight off Russian forces in Georgia\'s breakaway region of South Ossetia" b\'Why the Pentagon Thinks Attacking Iran is a Bad Idea - US News & World Report\' b\'Caucasus in crisis: Georgia invades South Ossetia\' b\'Indian shoe manufactory  - And again in a series of "you do not like your work?"\' b\'Visitors Suffering from Mental Illnesses Banned from Olympics\' b"No Help for Mexico\'s Kidnapping Surge"'], 
array([0, 1, 0, 0, 1])


from nltk.tokenize import word_tokenize

# Split every text into a list of word tokens.
corpus = list(map(word_tokenize, corpus))
X_train = list(map(word_tokenize, X_train))
X_test = list(map(word_tokenize, X_test))




 ["b'BREAKING", ':', 'Musharraf', 'to', 'be', 'impeached', '.', "'"]]






# 停止词
from nltk.corpus import stopwords
stop = stopwords.words('english')

# 数字
import re
def hasNumbers(inputString):
    """Return True when ``inputString`` contains at least one digit."""
    digit_match = re.search(r'\d', inputString)
    return digit_match is not None

# 特殊符号
def isSymbol(inputString):
    """Return True when ``inputString`` starts with a non-word character.

    Only the first character is inspected because ``re.match`` anchors at
    the start of the string, so a token such as ``jong-il`` is not a symbol.
    """
    leading_symbol = re.match(r'[^\w]', inputString)
    return leading_symbol is not None

# lemma
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def check(word):
    """Decide whether a token should be kept.

    A token is rejected when its lower-cased form is a stop word, contains
    a digit, or starts with a non-word character.

    Returns:
        True to keep the token, False to drop it.
    """
    word = word.lower()
    if word in stop:
        return False
    if hasNumbers(word) or isSymbol(word):
        return False
    # Reached only by tokens that passed every filter above.
    return True

# Combine the helpers above into a single cleaning step
def preprocessing(sen):
    """Filter and normalise one tokenised sentence.

    Args:
        sen: Iterable of token strings.

    Returns:
        A new list holding, for every token accepted by ``check``, its
        lowercased form with the leftover bytes-literal markers removed and
        then lemmatized with ``wordnet_lemmatizer``.
    """
    res = []
    for word in sen:
        if check(word):
            # This only strips the markers (b' / b" and stray quotes) left
            # behind when Python bytes were stored as str. The data was not
            # cleaned properly upstream; other datasets will not need this.
            word = word.lower().replace("b'", '').replace('b"', '').replace('"', '').replace("'", '')
            # Store the cleaned, lemmatized token; without this append the
            # function always returned an empty list.
            res.append(wordnet_lemmatizer.lemmatize(word))
    return res

# Run every tokenised sentence through preprocessing().
corpus = list(map(preprocessing, corpus))
X_train = list(map(preprocessing, X_train))
X_test = list(map(preprocessing, X_test))

['north', 'korean', 'leader', 'kim', 'jong-il', 'confirmed', 'ill']
['two', 'redditors', 'climbing', 'mt', 'kilimanjaro', 'charity', 'bidding', 'peak', 'nt', 'squander', 'opportunity', 'let', 'upvotes', 'something', 'awesome', 'estimated', 'take', 'year', 'clear', 'lao', 'explosive', 'remnant', 'left', 'behind', 'united', 'state', 'bomber', 'year', 'ago', 'people', 'died', 'unexploded', 'ordnance', 'since', 'conflict', 'ended', 'fidel', 'ahmadinejad', 'slandering', 'jew', 'mossad', 'america', 'israel', 'intelligence', 'agency', 'target', 'united', 'state', 'intensively', 'among', 'nation', 'considered', 'friendly', 'washington', 'israel', 'lead', 'others', 'active', 'espionage', 'directed', 'american', 'company', 'defense', 'department', 'australian', 'election', 'day', 'poll', 'rural/regional', 'independent', 'member', 'parliament', 'support', 'labor', 'minority', 'goverment', 'julia', 'gillard', 'prime', 'minister', 'france', 'plan', 'raise', 'retirement', 'age', 'set', 'strike', 'britain', 'parliament', 'police', 'murdoch', 'paper', 'adviser', 'pm', 'implicated', 'voicemail', 'hacking', 'scandal', 'british', 'policeman', 'jailed', 'month', 'cell', 'attack', 'woman', 'rest', 'email', 'display', 'fundemental', 'disdain', 'pluralistic', 'america', 'reveals', 'chilling', 'level', 'islamophobia', 'hatemongering', 'church', 'plan', 'burn', 'quran', 'endanger', 'troop', 'u', 'commander', 'warns', 'freed', 'journalist', 'tricked', 'captor', 'twitter', 'access', 'manila', 'water', 'crisis', 'expose', 'impact', 'privatisation', 'july', 'week-long', 'rationing', 'water', 'highlighted', 'reality', 'million', 'people', 'denied', 'basic', 'right', 'potable', 'water', 'sanitation', 'private', 'firm', 'rake', 'profit', 'expense', 'weird', 'uk', 'police', 'ask', 'help', 'case', 'slain', 'intelligence', 'agent', 'greenpeace', 'japan', 'anti-whaling', 'activist', 'found', 'guilty', 'theft', 'captured', 'journalist', 'trick', 'captor', 'revealing', 'alive', 'creepy', 'biometric', 'id', 'forced', 'onto', 'india', 'billion', 'inhabitant', 'fear', 
'loss', 'privacy', 'government', 'abuse', 'abound', 'india', 'gear', 'biometrically', 'identify', 'number', 'billion', 'inhabitant', 'china', 'young', 'officer', 'syndrome', 'china', 'military', 'spending', 'growing', 'fast', 'overtaken', 'strategy', 'said', 'professor', 'huang', 'jing', 'school', 'public', 'policy', 'young', 'officer', 'taking', 'control', 'strategy', 'like', 'young', 'officer', 'japan', 'mexican', 'soldier', 'open', 'fire', 'family', 'car', 'military', 'checkpoint', 'killing', 'father', 'son', 'death', 'toll', 'continues', 'climb', 'guatemala', 'landslide', 'foreign', 'power', 'stop', 'interfering', 'case', 'iranian', 'woman', 'sentenced', 'death', 'stoning', 'iran', 'foreign', 'ministry', 'said', 'mexican', 'official', 'gunman', 'behind', 'massacre', 'killed', 'tv', 'anchor', 'stabbed', 'death', 'outside', 'kabul', 'home', 'mosque', 'menace', 'confined', 'lower', 'manhattan', 'many', 'european', 'country', 'similar', 'alarm', 'sounded', 'muslim', 'coming', 'french', 'citizen', 'barred', 'american', 'military', 'base', 'dutch', 'neo-nazi', 'donates', 'sperm', 'white', 'dutch', 'neo-nazi', 'offered', 'donate', 'sperm', 'four', 'fertility', 'clinic', 'netherlands', 'effort', 'promote', 'call', 'strong', 'white', 'race']






所以,我们可以用一个 Python wrapper 封装出一个接口(interface),方便调用。



# Tag each training sample with its fastText label token ('__label__<y>').
# The token is appended to the sample's token list so the target is carried
# along when the tokens are later joined into one line of text; previously
# the label string was built but never attached to anything.
for tokens, target in zip(X_train, y_train):
    tokens.append('__label__' + str(target))

['the', 'man', 'podium', 'dutch', 'non-profit', 'reproductive', 'health', 'organization', 'sail', 'ship', 'around', 'world', 'anchoring', 'international', 'water', 'provide', 'abortion', 'woman', 'country', 'abortion', 'banned', 'b', 'grand', 'ayatollah', 'issue', 'decree', 'calling', 'muslim', 'defend', 'iraq', 'christian', 'marx', 'da', 'kapital', 'sale', 'soar', 'among', 'young', 'german', 'a', 'man', 'england', 'killed', 'wife', 'changed', 'facebook', 'relationship', 'status', 'single', 'georgia', 'used', 'cluster', 'bomb', 'august', 'war', 'arctic', 'temperature', 'break', 'all-time', 'recorded', 'high', 'reddit', 'please', 'send', 'help', 'uk', 'politician', 'insane', 'apparently', 'monitoring', 'mobile', 'web', 'record', 'would', 'giving', 'licence', 'terrorist', 'kill', 'people', 'wow', 'secret', 'coded', 'message', 'embedded', 'child', 'pornographic', 'image', 'paedophile', 'website', 'exploited', 'secure', 'way', 'passing', 'information', 'terrorist', 'england', 'run', 'honey', 'christmas', 'catastrophic', 'honeybee', 'decline', 'b', 'iran', 'stop', 'executing', 'youth', 'china', 'watch', 'internet', 'caf', 'customer', 'web', 'crackdown', 'china\\', 'medium', 'freedom', 'reduced', 'new', 'measure', 'include', 'camera', 'internet', 'cafe', 'picture', 'taken', 'user', 'bali', 'bombing', 'new', 'suspect', 'hindu', 'american', 'foundation', 'petition', 'ny', 'time', 'focus', 'much', 'activity', 'christian', 'missionary', 'india', 'anti-christian', 'violence', 'a', 'quick', 'overview', 'islamic', 'terror', 'organization', 'get', 'funding', 'last', 'titantic', 'survivor', 'auction', 'memento', 'pay', 'nursing', 'home', 'better', 'hungary', 'get', 'loan', 'avert', 'meltdown', 'sao', 'paolo', 'hundred', 'black-clad', 'military', 'police', 'fired', 'teargas', 'stun', 'grenade', 'rubber', 'bullet', 'striking', 'civilian', 'officer', 'seeking', 'percent', 'pay', 'raise', 'austrailian', 'historian', 'arrested', 'holocaust', 'denial', 'defense', 'secretary', 'gate', 
'said', 'prepared', 'reconciliation', 'taliban', 'part', 'political', 'outcome', 'afghanistan', 'is', 'switzerland', 'next', 'iceland', 'switzerland', 'forced', 'take', 'emergency', 'measure', 'yesterday', 'shore', 'two', 'biggest', 'lender', 'prevent', 'collapse', 'confidence', 'country\\', 'banking', 'system', 'police', 'battle', 'police', 'sao', 'paulo', 'civilian', 'killed', 'nato', 'air', 'strike', 'afghanistan', 'villager', 'the', 'west', 'loss', 'afghanistan', '__label__0']





# Collapse each training token list into one space-separated string.
X_train = list(map(' '.join, X_train))

north korea halt denuclearisation u fails remove list state sponsoring terrorism child among dead u airstrike afghanistan the russian parliament voted overwhelmingly officially recognize independence abkhazia south ossetia violent animal right activist set fire scientist home little protection available scientist nbc censored olympic champion matthew mitcham gay un say convincing evidence show u airstrike afghanistan killed people including child italy try outlaw islam mystery virus kill israeli group peace say settlement construction occupied west bank nearly doubled since last year b revealed britain secret propaganda war al-qaida b israel settlement surge draw rice criticism solar powered carbon neutral pyramid house million people dubai russia claim proof genocide how nato transformed military alliance quasi-united nation cartwheeling banned school philly-area activist released china jeff said slapped around threatend saying want head cut want shot b vatican describes hindu attack christian orphanage god protester tell tale beijing detention- sleep deprivation threat oh python kill zookeeper kelly murdered say uk intelligence insider b fury image myra hindley appears british film olympics party b north korea suspend nuclear disablement german suspect bayer pesticide beehive collapse research terrorism invaluable fear arrest top u diplomat escape gun attack pakistan __label__1
# Collapse each test token list into one space-separated string.
X_test = list(map(' '.join, X_test))

# Dump the data to disk, one sample per line. The loop bodies were empty
# before, which is a syntax error and wrote nothing.
# Files are written as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open('../input/train_ft.txt', 'w', encoding='utf-8') as f:
    for sen in X_train:
        f.write(sen + '\n')

with open('../input/test_ft.txt', 'w', encoding='utf-8') as f:
    for sen in X_test:
        f.write(sen + '\n')

# Test targets go into their own file, one label per line.
with open('../input/test_label_ft.txt', 'w', encoding='utf-8') as f:
    for label in y_test:
        f.write(str(label) + '\n')
import fasttext

# Train a supervised fastText classifier from the training file written above;
# 'model' is passed as the second positional argument.
# NOTE(review): `fasttext.supervised` looks like the legacy fasttext Python
# wrapper API — confirm the installed package version still provides it.
clf = fasttext.supervised('../input/train_ft.txt', 'model', dim=256, ws=5, neg=5, epoch=100, min_count=10, lr=0.1, lr_update_rate=1000, bucket=200000)

# NOTE(review): y_scores is not used anywhere in the visible code.
y_scores = []

# Use predict to get the model's decision for each test sentence
labels = clf.predict(X_test)

# Flatten the predictions into a 1-D integer array.
# NOTE(review): presumably predict returns one single-element list per
# sentence containing the bare label text ("0"/"1") — verify against the API.
y_preds = np.array(labels).flatten().astype(int)

# 我们来看看

from sklearn import metrics

# Compute the AUC (area under the ROC curve); note this is not an accuracy.
# NOTE(review): y_preds holds integer class predictions (see the astype(int)
# conversion above), not scores or probabilities — confirm that is intended.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_preds, pos_label=1)
print(metrics.auc(fpr, tpr))
[1 0 0 1 1 0 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1
 0 0 1 0 0 1 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 0 1 1 1 0 1 1 0 0 1
 0 1 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1
 0 0 1 1 1 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0
 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 0 1
 0 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1 0 0 1 0 1 0 0 0 1 1
 1 0 1 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 1 1 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 1 1
 0 1 0 1 0 1 1 0 1 0 1 1 0 0 1 1 0 0 0 0 0 1 1 1 0 0 1 0 1 1 0 0 1 1 1 1 1
 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 1 1 1 0 1 1 1 0 1 0 1 1 0
 0 1 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 1 1 0 1 0 1 1 0 1 1 1 0 0 0 0 0 1 0 1 1
 0 1 0 0 1 1 1 1]
[0 1 0 1 1 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 1 0 0 1 1 0 1 1 1 0 1 1 0 0 0 1 1
 1 1 1 0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0
 1 1 0 1 0 0 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 0 0 1 1 1 1 0 1 0 1
 1 1 0 0 1 1 1 1 1 0 0 1 0 0 1 1 0 1 0 0 1 0 1 0 1 1 0 0 0 1 0 0 1 1 1 1 1
 0 0 1 1 0 0 0 1 1 0 1 1 1 0 1 0 1 0 0 1 1 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 1
 1 1 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 0 0 1 0 0 1 0
 1 0 1 0 0 0 1 1 0 0 1 1 1 1 0 1 1 1 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1 1 1
 1 1 0 0 0 1 1 0 1 1 1 0 1 0 1 1 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 0 0 0 0 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1 1 1 1 0
 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0
 1 1 1 0 1 1 0 1]
同理,这里我们可以通过 parameter tuning 或 resampling 让结果更好。




