These notes record what I learned while practicing on Kaggle, drawing on two references:
1. http://www.cnblogs.com/lijingpeng/p/5787549.html
2. The book "Python Machine Learning and Practice" (《python机器学习及实战》)
# Load the complete 20 Newsgroups corpus with scikit-learn
from sklearn.datasets import fetch_20newsgroups

news = fetch_20newsgroups(subset='all')
X, y = news.data, news.target

# Check the number of documents, the length of the first document,
# and the length of its first character (always 1, since X[0] is a string)
print(len(X), len(X[0]), len(X[0][0]))
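Each label in y is just an integer index into news.target_names, so a quick way to see which newsgroup a post belongs to (illustrative; the exact category printed depends on scikit-learn's default shuffling of the corpus):

print(news.target_names[y[0]])   # e.g. 'rec.sport.hockey'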
from bs4 import BeautifulSoup
import nltk, re

def news_to_sentences(raw_text):
    # Strip the HTML tags and keep only the text content
    news_text = BeautifulSoup(raw_text, 'html.parser').get_text()
    # Split the text into sentences with NLTK's Punkt model
    # (download it once with nltk.download('punkt'))
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(news_text)
    # Lowercase each sentence, replace non-letters with spaces, split into words
    sentences = []
    for sent in raw_sentences:
        sentences.append(re.sub('[^a-zA-Z]', ' ', sent.lower().strip()).split())
    return sentences
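A quick sanity check on a made-up snippet (hypothetical input, just to show the output shape):

print(news_to_sentences('Hello world. This is a <b>test</b> sentence!'))
# -> [['hello', 'world'], ['this', 'is', 'a', 'test', 'sentence']]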
# Flatten the whole corpus into one big list of tokenized sentences
sentences = []
for x in X:
    sentences += news_to_sentences(x)
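Before training, it is worth confirming how much data we have (subset='all' contains 18,846 posts, each contributing several sentences):

print(len(sentences))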
from gensim.models import word2vec

num_features = 300     # dimensionality of the word vectors
min_word_count = 20    # ignore words that occur fewer than 20 times
num_workers = 2        # number of training threads
context = 5            # context window size
downsampling = 1e-3    # downsampling rate for very frequent words

# Train the word2vec model (in gensim >= 4.0, `size` is named `vector_size`)
model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)
# Finalize the model: discard training state, keep only the normalized vectors
# (this call is deprecated and unnecessary in gensim >= 4.0)
model.init_sims(replace=True)

# Words closest to 'morning' in the learned embedding space
model.wv.most_similar('morning')
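most_similar returns (word, cosine similarity) pairs; with this corpus, words such as 'afternoon' or 'weekend' tend to rank highly, though results vary from run to run. A few other useful queries on the trained vectors, all standard gensim API (the file name below is just a placeholder):

# Cosine similarity between two specific words
print(model.wv.similarity('morning', 'evening'))

# The raw 300-dimensional vector learned for a word
vec = model.wv['morning']

# Save the trained model and reload it later
model.save('20news_word2vec.model')
model = word2vec.Word2Vec.load('20news_word2vec.model')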