Chapter 1
import timeit
normal_py_sec = timeit.timeit('sum(x*x for x in range(1000))',
number=10000)
naive_np_sec = timeit.timeit('sum(na*na)',
setup="import numpy as np; na=np.arange(1000)",
number=10000)
good_np_sec = timeit.timeit('na.dot(na)',
setup="import numpy as np; na=np.arange(1000)",
number=10000)
print("Normal Python: %f sec" % normal_py_sec)
print("Naive NumPy: %f sec" % naive_np_sec)
print("Good NumPy: %f sec" % good_np_sec)
# This script generates web traffic data for our hypothetical
# web startup "MLASS" in chapter 01
import os
import scipy as sp
from scipy.stats import gamma
import matplotlib.pyplot as plt
sp.random.seed(3) # to reproduce the data later on
x = sp.arange(1, 31 * 24)
y = sp.array(200 * (sp.sin(2 * sp.pi * x / (7 * 24))), dtype=int)
y += gamma.rvs(15, loc=0, scale=100, size=len(x))  # add gamma-distributed noise
y += 2 * sp.exp(x / 100.0)
y = sp.ma.array(y, mask=[y < 0])  # use a masked array to hide negative values
print(sum(y), sum(y < 0))
plt.scatter(x, y)
plt.title("Web traffic over the last month")
plt.xlabel("Time")
plt.ylabel("Hits/hour")
plt.xticks([w * 7 * 24 for w in [0, 1, 2, 3, 4]], ['week %i' % (w + 1) for w in [
0, 1, 2, 3, 4]])
plt.autoscale(tight=True)
plt.grid()
plt.savefig(os.path.join("..", "1400_01_01.png"))
data_dir = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "..", "data")  # note the path handling: os.path.join, os.path.dirname, os.path.realpath(__file__)
# sp.savetxt(os.path.join("..", "web_traffic.tsv"),
# zip(x[~y.mask],y[~y.mask]), delimiter="\t", fmt="%i")
sp.savetxt(os.path.join(
data_dir, "web_traffic.tsv"), list(zip(x, y)), delimiter="\t", fmt="%s")
import os
import scipy as sp
import matplotlib.pyplot as plt
data_dir = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "..", "data")
data = sp.genfromtxt(os.path.join(data_dir, "web_traffic.tsv"), delimiter="\t")  # load the data file with scipy
print(data[:10])
# all examples will have three classes in this file
colors = ['g', 'k', 'b', 'm', 'r']
linestyles = ['-', '-.', '--', ':', '-']
x = data[:, 0]
y = data[:, 1]
print("Number of invalid entries:", sp.sum(sp.isnan(y)))
x = x[~sp.isnan(y)]  # ~ negates the boolean mask
y = y[~sp.isnan(y)]
# plot input data
def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None):
plt.clf()
plt.scatter(x, y, s=10)
plt.title("Web traffic over the last month")
plt.xlabel("Time")
plt.ylabel("Hits/hour")
plt.xticks(
[w * 7 * 24 for w in range(10)], ['week %i' % w for w in range(10)])
if models:
if mx is None:
mx = sp.linspace(0, x[-1], 1000)
for model, style, color in zip(models, linestyles, colors):  # plot each model in a loop
# print "Model:",model
# print "Coeffs:",model.coeffs
plt.plot(mx, model(mx), linestyle=style, linewidth=2, c=color)
plt.legend(["d=%i" % m.order for m in models], loc="upper left")
plt.autoscale(tight=True)
plt.ylim(ymin=0)
if ymax:
plt.ylim(ymax=ymax)
if xmin:
plt.xlim(xmin=xmin)
plt.grid(True, linestyle='-', color='0.75')
plt.savefig(fname)  # save the chart
# first look at the data
plot_models(x, y, None, os.path.join("..", "1400_01_01.png"))
# create and plot models
fp1, res, rank, sv, rcond = sp.polyfit(x, y, 1, full=True)
print("Model parameters: %s" % fp1)
print("Error of the model:", res)
f1 = sp.poly1d(fp1)
f2 = sp.poly1d(sp.polyfit(x, y, 2))  # poly1d turns the fitted coefficients into a polynomial function
f3 = sp.poly1d(sp.polyfit(x, y, 3))
f10 = sp.poly1d(sp.polyfit(x, y, 10))
f100 = sp.poly1d(sp.polyfit(x, y, 100))
plot_models(x, y, [f1], os.path.join("..", "1400_01_02.png"))
plot_models(x, y, [f1, f2], os.path.join("..", "1400_01_03.png"))
plot_models(
x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_04.png"))
# fit and plot a model using the knowledge about inflection point
inflection = int(3.5 * 7 * 24)  # cast to int so it can be used as an array index
xa = x[:inflection]
ya = y[:inflection]
xb = x[inflection:]
yb = y[inflection:]
fa = sp.poly1d(sp.polyfit(xa, ya, 1))
fb = sp.poly1d(sp.polyfit(xb, yb, 1))
plot_models(x, y, [fa, fb], os.path.join("..", "1400_01_05.png"))
def error(f, x, y):
return sp.sum((f(x) - y) ** 2)
print("Errors for the complete data set:")
for f in [f1, f2, f3, f10, f100]:
print("Error d=%i: %f" % (f.order, error(f, x, y)))
print("Errors for only the time after inflection point")
for f in [f1, f2, f3, f10, f100]:
print("Error d=%i: %f" % (f.order, error(f, xb, yb)))
print("Error inflection=%f" % (error(fa, xa, ya) + error(fb, xb, yb)))
# extrapolating into the future
plot_models(
x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_06.png"),
mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
ymax=10000, xmin=0 * 7 * 24)
print("Trained only on data after inflection point")
fb1 = fb
fb2 = sp.poly1d(sp.polyfit(xb, yb, 2))
fb3 = sp.poly1d(sp.polyfit(xb, yb, 3))
fb10 = sp.poly1d(sp.polyfit(xb, yb, 10))
fb100 = sp.poly1d(sp.polyfit(xb, yb, 100))
print("Errors for only the time after inflection point")
for f in [fb1, fb2, fb3, fb10, fb100]:
print("Error d=%i: %f" % (f.order, error(f, xb, yb)))
plot_models(
x, y, [fb1, fb2, fb3, fb10, fb100], os.path.join("..", "1400_01_07.png"),
mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
ymax=10000, xmin=0 * 7 * 24)
# separating training from testing data
frac = 0.3
split_idx = int(frac * len(xb))
shuffled = sp.random.permutation(list(range(len(xb))))
test = sorted(shuffled[:split_idx])
train = sorted(shuffled[split_idx:])
fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1))
fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2))
fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3))
fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10))
fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100))
print("Test errors for only the time after inflection point")
for f in [fbt1, fbt2, fbt3, fbt10, fbt100]:
print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test])))
plot_models(
x, y, [fbt1, fbt2, fbt3, fbt10, fbt100], os.path.join("..",
"1400_01_08.png"),
mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
ymax=10000, xmin=0 * 7 * 24)
from scipy.optimize import fsolve
print(fbt2)
print(fbt2 - 100000)
reached_max = fsolve(fbt2 - 100000, 800) / (7 * 24)
print("100,000 hits/hour expected at week %f" % reached_max[0])
Chapter 2
import numpy as np
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
data = load_iris()
features = data['data']
feature_names = data['feature_names']
target = data['target']
pairs = [(0,1),(0,2),(0,3),(1,2),(1,3),(2,3)]
for i,(p0,p1) in enumerate(pairs): # note enumerate with tuple unpacking: i, (p0, p1)
plt.subplot(2,3,i+1)
for t,marker,c in zip(range(3),">ox","rgb"):
plt.scatter(features[target == t,p0], features[target == t,p1], marker=marker, c=c)
plt.xlabel(feature_names[p0])
plt.ylabel(feature_names[p1])
plt.xticks([])
plt.yticks([])
plt.savefig('../1400_02_01.png')
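# Standalone illustration of enumerate with tuple unpacking, as used in the loop above:
toy_pairs = [(0, 1), (0, 2)]
for i, (p0, p1) in enumerate(toy_pairs):
    print(i, p0, p1)   # 0 0 1, then 1 0 2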
from load import load_dataset
def test_iris():
features, labels = load_dataset('iris')
assert len(features[0]) == 4 # assertion
assert len(features)
assert len(features) == len(labels)
def test_seeds():
features, labels = load_dataset('seeds')
assert len(features[0]) == 7
assert len(features)
assert len(features) == len(labels)
import milksets.iris
import milksets.seeds
def save_as_tsv(fname, module):
features, labels = module.load()
nlabels = [module.label_names[ell] for ell in labels]
with open(fname, 'w') as ofile:
for f,n in zip(features, nlabels):
print("\t".join(list(map(str, f)) + [n]), file=ofile)
save_as_tsv('iris.tsv', milksets.iris)
save_as_tsv('seeds.tsv', milksets.seeds)
COLOUR_FIGURE = False
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
data = load_iris()
features = data['data']
feature_names = data['feature_names']
species = data['target_names'][data['target']]
setosa = (species == 'setosa')
features = features[~setosa]
species = species[~setosa]
virginica = species == 'virginica'
t = 1.75
p0,p1 = 3,2
if COLOUR_FIGURE:
area1c = (1.,.8,.8)
area2c = (.8,.8,1.)
else:
area1c = (1.,1,1)
area2c = (.7,.7,.7)
x0,x1 =[features[:,p0].min()*.9,features[:,p0].max()*1.1] # unpack two values from a list into two variables
y0,y1 =[features[:,p1].min()*.9,features[:,p1].max()*1.1]
plt.fill_between([t,x1],[y0,y0],[y1,y1],color=area2c) # fill the region with color
plt.fill_between([x0,t],[y0,y0],[y1,y1],color=area1c)
plt.plot([t,t],[y0,y1],'k--',lw=2)
plt.plot([t-.1,t-.1],[y0,y1],'k:',lw=2)
plt.scatter(features[virginica,p0], features[virginica,p1], c='b', marker='o')
plt.scatter(features[~virginica,p0], features[~virginica,p1], c='r', marker='x')
plt.ylim(y0,y1)
plt.xlim(x0,x1)
plt.xlabel(feature_names[p0])
plt.ylabel(feature_names[p1])
plt.savefig('../1400_02_02.png')
COLOUR_FIGURE = False
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap # colormap built from a list of colors
from load import load_dataset
import numpy as np
from knn import learn_model, apply_model, accuracy
feature_names = [
'area',
'perimeter',
'compactness',
'length of kernel',
'width of kernel',
'asymmetry coefficient',
'length of kernel groove',
]
def train_plot(features, labels):
y0,y1 = features[:,2].min()*.9, features[:,2].max()*1.1
x0,x1 = features[:,0].min()*.9, features[:,0].max()*1.1
X = np.linspace(x0,x1,100)
Y = np.linspace(y0,y1,100)
X,Y = np.meshgrid(X,Y) # meshgrid builds the 2-D grid of coordinates
model = learn_model(1, features[:,(0,2)], np.array(labels))
C = apply_model(np.vstack([X.ravel(),Y.ravel()]).T, model).reshape(X.shape) # ravel returns a view where possible, unlike flatten
if COLOUR_FIGURE:
cmap = ListedColormap([(1.,.6,.6),(.6,1.,.6),(.6,.6,1.)]) # ListedColormap usage
else:
cmap = ListedColormap([(1.,1.,1.),(.2,.2,.2),(.6,.6,.6)])
plt.xlim(x0,x1)
plt.ylim(y0,y1)
plt.xlabel(feature_names[0])
plt.ylabel(feature_names[2])
plt.pcolormesh(X,Y,C, cmap=cmap)
if COLOUR_FIGURE:
cmap = ListedColormap([(1.,.0,.0),(.0,1.,.0),(.0,.0,1.)])
plt.scatter(features[:,0], features[:,2], c=labels, cmap=cmap)
else:
for lab,ma in zip(range(3), "Do^"):
plt.plot(features[labels == lab,0], features[labels == lab,2], ma, c=(1.,1.,1.))
features,labels = load_dataset('seeds')
names = sorted(set(labels))
labels = np.array([names.index(ell) for ell in labels])
train_plot(features, labels)
plt.savefig('../1400_02_04.png')
features -= features.mean(0) # concise z-score normalization (together with the next line)
features /= features.std(0)
train_plot(features, labels)
plt.savefig('../1400_02_05.png')
from matplotlib import pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from threshold import learn_model, apply_model, accuracy
data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]
setosa = (labels == 'setosa')
features = features[~setosa]
labels = labels[~setosa]
virginica = (labels == 'virginica')
testing = np.tile([True, False], 50) # np.tile repeats the array; here it yields an alternating True/False mask of length 100
training = ~testing # concise boolean negation
model = learn_model(features[training], virginica[training])
train_error = accuracy(features[training], virginica[training], model)
test_error = accuracy(features[testing], virginica[testing], model)
print('''\
Training error was {0:.1%}.
Testing error was {1:.1%} (N = {2}).
'''.format(train_error, test_error, testing.sum())) # note the {0}/{1} placeholders filled in by .format()
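# Standalone illustration of np.tile, which builds the alternating test/training mask above:
import numpy as np
print(np.tile([True, False], 3))   # [ True False  True False  True False]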
import numpy as np
def learn_model(k, features, labels):
return k, features.copy(),labels.copy()
def plurality(xs):
from collections import defaultdict # import inside the function
counts = defaultdict(int) # dict whose missing values default to 0
for x in xs:
counts[x] += 1
maxv = max(counts.values())
for k,v in counts.items():
if v == maxv:
return k
def apply_model(features, model):
k, train_feats, labels = model
results = []
for f in features:
label_dist = []
for t,ell in zip(train_feats, labels):
label_dist.append( (np.linalg.norm(f-t), ell) )
label_dist.sort(key=lambda d_ell: d_ell[0])
label_dist = label_dist[:k]
results.append(plurality([ell for _,ell in label_dist]))
return np.array(results)
def accuracy(features, labels, model):
preds = apply_model(features, model)
return np.mean(preds == labels) # concise accuracy: compare predictions with labels element-wise and take the mean
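# Minimal usage sketch of the kNN helpers defined above (toy data, k=1):
import numpy as np
toy_model = learn_model(1, np.array([[0.0], [1.0], [2.0]]), np.array(['a', 'a', 'b']))
print(apply_model(np.array([[0.1], [1.9]]), toy_model))                     # ['a' 'b']
print(accuracy(np.array([[0.1], [1.9]]), np.array(['a', 'b']), toy_model))  # 1.0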
import numpy as np
def load_dataset(dataset_name):
'''
data,labels = load_dataset(dataset_name)
Load a given dataset
Returns
-------
data : numpy ndarray
labels : list of str
'''
data = []
labels = []
with open('../data/{0}.tsv'.format(dataset_name)) as ifile: # note the with open(...) as ... idiom
for line in ifile:
tokens = line.strip().split('\t')
data.append([float(tk) for tk in tokens[:-1]])
labels.append(tokens[-1])
data = np.array(data)
labels = np.array(labels)
return data, labels
from load import load_dataset
import numpy as np
from knn import learn_model, apply_model, accuracy
features,labels = load_dataset('seeds')
def cross_validate(features, labels):
error = 0.0
for fold in range(10):
training = np.ones(len(features), bool) # np.ones plus the [fold::10] slice below selects the cross-validation folds
training[fold::10] = 0
testing = ~training # the test set is simply the complement of the training set
model = learn_model(1, features[training], labels[training])
test_error = accuracy(features[testing], labels[testing], model)
error += test_error
return error/ 10.0
error = cross_validate(features, labels)
print('Ten fold cross-validated error was {0:.1%}.'.format(error))
features -= features.mean(0)
features /= features.std(0)
error = cross_validate(features, labels)
print('Ten fold cross-validated error after z-scoring was {0:.1%}.'.format(error))
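# How the training[fold::10] slicing above partitions the data (standalone sketch with 10 samples):
import numpy as np
toy_training = np.ones(10, bool)
toy_training[2::10] = 0     # fold 2 holds out every 10th sample starting at index 2
print(toy_training)         # index 2 is False, everything else stays in the training set
print(~toy_training)        # the complementary test mask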
from load import load_dataset
import numpy as np
from threshold import learn_model, apply_model, accuracy
features,labels = load_dataset('seeds')
labels = labels == 'Canadian'
error = 0.0
for fold in range(10):
training = np.ones(len(features), bool)
training[fold::10] = 0
testing = ~training
model = learn_model(features[training], labels[training])
test_error = accuracy(features[testing], labels[testing], model)
error += test_error
error /= 10.0
print('Ten fold cross-validated error was {0:.1%}.'.format(error))
import numpy as np
from sklearn.datasets import load_iris
data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]
plength = features[:,2]
is_setosa = (labels == 'setosa')
print('Maximum of setosa: {0}.'.format(plength[is_setosa].max()))
print('Minimum of others: {0}.'.format(plength[~is_setosa].min()))
from matplotlib import pyplot as plt
from sklearn.datasets import load_iris
data = load_iris()
features = data['data']
labels = data['target_names'][data['target']]
setosa = (labels == 'setosa')
features = features[~setosa]
labels = labels[~setosa]
virginica = (labels == 'virginica') # the comparison yields a boolean array, which is then assigned
best_acc = -1.0
for fi in range(features.shape[1]):
thresh = features[:,fi].copy()
thresh.sort()
for t in thresh:
pred = (features[:,fi] > t)
acc = (pred == virginica).mean()
if acc > best_acc:
best_acc = acc
best_fi = fi
best_t = t
print('Best cut is {0} on feature {1}, which achieves accuracy of {2:.1%}.'.format(best_t,best_fi,best_acc))
import numpy as np
def learn_model(features, labels):
best_acc = -1.0
for fi in range(features.shape[1]):
thresh = features[:,fi].copy()
thresh.sort()
for t in thresh:
pred = (features[:,fi] > t)
acc = (pred == labels).mean()
if acc > best_acc:
best_acc = acc
best_fi = fi
best_t = t
return best_t, best_fi
def apply_model(features, model):
t, fi = model
return features[:,fi] > t
def accuracy(features, labels, model):
preds = apply_model(features, model)
return np.mean(preds == labels)
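# Minimal usage sketch of the threshold model defined above (toy, separable data):
import numpy as np
toy_feats = np.array([[1.0], [2.0], [3.0], [4.0]])
toy_labels = np.array([False, False, True, True])
toy_model = learn_model(toy_feats, toy_labels)
print(toy_model)                                   # (best threshold, feature index)
print(accuracy(toy_feats, toy_labels, toy_model))  # 1.0 on this toy set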
Chapter 3
import os
import sys
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer
DIR = r"../data/toy"
posts = [open(os.path.join(DIR, f)).read() for f in os.listdir(DIR)] # read every document listed by os.listdir
new_post = "imaging databases"
import nltk.stem # stemming library
english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedCountVectorizer(CountVectorizer): # subclass CountVectorizer
def build_analyzer(self):
analyzer = super(StemmedCountVectorizer, self).build_analyzer()
return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
# vectorizer = CountVectorizer(min_df=1, stop_words='english',
# preprocessor=stemmer)
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')
from sklearn.feature_extraction.text import TfidfVectorizer
class StemmedTfidfVectorizer(TfidfVectorizer):
def build_analyzer(self):
analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
vectorizer = StemmedTfidfVectorizer(
min_df=1, stop_words='english', charset_error='ignore')
print(vectorizer)
X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec, type(new_post_vec))
print(new_post_vec.toarray())
print(vectorizer.get_feature_names())
def dist_raw(v1, v2):
delta = v1 - v2
return sp.linalg.norm(delta.toarray()) # linalg.norm computes the Euclidean norm
def dist_norm(v1, v2):
v1_normalized = v1 / sp.linalg.norm(v1.toarray())
v2_normalized = v2 / sp.linalg.norm(v2.toarray())
delta = v1_normalized - v2_normalized
return sp.linalg.norm(delta.toarray())
dist = dist_norm
best_dist = sys.maxsize # sys.maxsize: a very large integer used as the initial best distance (Python 3)
best_i = None
for i in range(0, num_samples):
post = posts[i]
if post == new_post:
continue
post_vec = X_train.getrow(i)
d = dist(post_vec, new_post_vec)
print("=== Post %i with dist=%.2f: %s" % (i, d, post))
if d < best_dist:
best_dist = d
best_i = i
print("Best post is %i with dist=%.2f" % (best_i, best_dist))
import sklearn.datasets
import scipy as sp
new_post = \
"""Disk drive problems. Hi, I have a problem with my hard disk.
After 1 year it is working only sporadically now.
I tried to format it, but now it doesn't boot any more.
Any ideas? Thanks.
"""
MLCOMP_DIR = r"P:\Dropbox\pymlbook\data"
groups = [
'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
dataset = sklearn.datasets.load_mlcomp("20news-18828", "train",
mlcomp_root=MLCOMP_DIR,
categories=groups)
print("Number of posts:", len(dataset.filenames))
labels = dataset.target
num_clusters = 50 # sp.unique(labels).shape[0]
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')
from sklearn.feature_extraction.text import TfidfVectorizer
class StemmedTfidfVectorizer(TfidfVectorizer):
def build_analyzer(self):
analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
# max_features=1000,
stop_words='english', charset_error='ignore'
)
vectorized = vectorizer.fit_transform(dataset.data)
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))
from sklearn.cluster import KMeans
km = KMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
verbose=1)
clustered = km.fit(vectorized)
from sklearn import metrics # many evaluation functions live in this module
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand Index: %0.3f" %
metrics.adjusted_rand_score(labels, km.labels_))
print("Adjusted Mutual Information: %0.3f" %
metrics.adjusted_mutual_info_score(labels, km.labels_))
print(("Silhouette Coefficient: %0.3f" %
metrics.silhouette_score(vectorized, labels, sample_size=1000)))
new_post_vec = vectorizer.transform([new_post])
new_post_label = km.predict(new_post_vec)[0]
similar_indices = (km.labels_ == new_post_label).nonzero()[0]
similar = []
for i in similar_indices:
dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
similar.append((dist, dataset.data[i]))
similar = sorted(similar)
# import pdb; pdb.set_trace()
show_at_1 = similar[0]
show_at_2 = similar[len(similar) // 2]
show_at_3 = similar[-1]
print(show_at_1)
print(show_at_2)
print(show_at_3)
import scipy as sp
def tfidf(t, d, D):
tf = float(d.count(t)) / sum(d.count(w) for w in set(d))
idf = sp.log(float(len(D)) / (len([doc for doc in D if t in doc])))
return tf * idf
a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]
D = [a, abb, abc]
print(tfidf("a", a, D))
print(tfidf("b", abb, D))
print(tfidf("a", abc, D))
print(tfidf("b", abc, D))
print(tfidf("c", abc, D))
import os
import scipy as sp
from scipy.stats import norm
from matplotlib import pylab
from sklearn.cluster import KMeans
seed = 2
sp.random.seed(seed) # to reproduce the data later on
num_clusters = 3
def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None):
pylab.figure(num=None, figsize=(8, 6))
if km:
pylab.scatter(x, y, s=50, c=km.predict(list(zip(x, y)))) # zip pairs up the x and y coordinates
else:
pylab.scatter(x, y, s=50)
pylab.title(title)
pylab.xlabel("Occurrence word 1")
pylab.ylabel("Occurrence word 2")
# pylab.xticks([w*7*24 for w in range(10)], ['week %i'%w for w in range(10)])
pylab.autoscale(tight=True)
pylab.ylim(ymin=0, ymax=1)
pylab.xlim(xmin=0, xmax=1)
pylab.grid(True, linestyle='-', color='0.75')
return pylab
xw1 = norm(loc=0.3, scale=.15).rvs(20) # rvs(20) draws 20 random samples
yw1 = norm(loc=0.3, scale=.15).rvs(20)
xw2 = norm(loc=0.7, scale=.15).rvs(20)
yw2 = norm(loc=0.7, scale=.15).rvs(20)
xw3 = norm(loc=0.2, scale=.15).rvs(20)
yw3 = norm(loc=0.8, scale=.15).rvs(20)
x = sp.append(sp.append(xw1, xw2), xw3)
y = sp.append(sp.append(yw1, yw2), yw3)
i = 1
plot_clustering(x, y, "Vectors")
pylab.savefig(os.path.join("..", "1400_03_0%i.png" % i))
pylab.clf()
i += 1
mx, my = sp.meshgrid(sp.arange(0, 1, 0.001), sp.arange(0, 1, 0.001))
km = KMeans(init='random', n_clusters=num_clusters, verbose=1,
n_init=1, max_iter=1,
random_state=seed)
km.fit(sp.array(list(zip(x, y))))
Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)
plot_clustering(x, y, "Clustering iteration 1", km=km)
pylab.imshow(Z, interpolation='nearest', # imshow renders the cluster regions as an image
extent=(mx.min(), mx.max(), my.min(), my.max()),
cmap=pylab.cm.Blues,
aspect='auto', origin='lower')
c1a, c1b, c1c = km.cluster_centers_
pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
marker='x', linewidth=2, s=100, color='black')
pylab.savefig(os.path.join("..", "1400_03_0%i.png" % i))
pylab.clf()
i += 1
#################### 2 iterations ####################
km = KMeans(init='random', n_clusters=num_clusters, verbose=1,
n_init=1, max_iter=2,
random_state=seed)
km.fit(sp.array(list(zip(x, y))))
Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)
plot_clustering(x, y, "Clustering iteration 2", km=km)
pylab.imshow(Z, interpolation='nearest',
extent=(mx.min(), mx.max(), my.min(), my.max()),
cmap=pylab.cm.Blues,
aspect='auto', origin='lower')
c2a, c2b, c2c = km.cluster_centers_
pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
marker='x', linewidth=2, s=100, color='black')
# import pdb;pdb.set_trace()
pylab.gca().add_patch(
pylab.Arrow(c1a[0], c1a[1], c2a[0] - c1a[0], c2a[1] - c1a[1], width=0.1))
pylab.gca().add_patch(
pylab.Arrow(c1b[0], c1b[1], c2b[0] - c1b[0], c2b[1] - c1b[1], width=0.1))
pylab.gca().add_patch(
pylab.Arrow(c1c[0], c1c[1], c2c[0] - c1c[0], c2c[1] - c1c[1], width=0.1))
pylab.savefig(os.path.join("..", "1400_03_0%i.png" % i))
pylab.clf()
i += 1
#################### 10 iterations ####################
km = KMeans(init='random', n_clusters=num_clusters, verbose=1,
n_init=1, max_iter=10,
random_state=seed)
km.fit(sp.array(list(zip(x, y))))
Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)
plot_clustering(x, y, "Clustering iteration 10", km=km)
pylab.imshow(Z, interpolation='nearest',
extent=(mx.min(), mx.max(), my.min(), my.max()),
cmap=pylab.cm.Blues,
aspect='auto', origin='lower')
pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
marker='x', linewidth=2, s=100, color='black')
pylab.savefig(os.path.join("..", "1400_03_0%i.png" % i))
pylab.clf()
i += 1
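# Minimal self-contained KMeans sketch on toy 2-D points (illustrative only,
# parameters chosen arbitrarily):
import numpy as np
from sklearn.cluster import KMeans
toy_pts = np.array([[0, 0], [0, 1], [10, 10], [10, 11]], dtype=float)
toy_km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(toy_pts)
print(toy_km.cluster_centers_)
print(toy_km.predict([[0.5, 0.5], [9.5, 10.5]]))   # one point per cluster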
Chapter 4
from __future__ import print_function
from gensim import corpora, models, similarities # gensim: topic modeling (LDA)
from mpltools import style
import matplotlib.pyplot as plt
import numpy as np
from os import path
style.use('ggplot') # plotting style
if not path.exists('./data/ap/ap.dat'): # check that the data is present with os.path.exists
print('Error: Expected data to be present at data/ap/')
corpus = corpora.BleiCorpus('./data/ap/ap.dat', './data/ap/vocab.txt')
model = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=corpus.id2word, alpha=None)
for ti in range(84):
words = model.show_topic(ti, 64)
tf = sum(f for f,w in words)
print('\n'.join('{}:{}'.format(w, int(1000.*f/tf)) for f,w in words))
print()
print()
print()
thetas = [model[c] for c in corpus]
plt.hist([len(t) for t in thetas], np.arange(42))
plt.ylabel('Nr of documents')
plt.xlabel('Nr of topics')
plt.savefig('../1400OS_04_01+.png')
model1 = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=corpus.id2word, alpha=1.)
thetas1 = [model1[c] for c in corpus]
#model8 = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=corpus.id2word, alpha=1.e-8)
#thetas8 = [model8[c] for c in corpus]
plt.clf()
plt.hist([[len(t) for t in thetas], [len(t) for t in thetas1]], np.arange(42))
plt.ylabel('Nr of documents')
plt.xlabel('Nr of topics')
plt.text(9,223, r'default alpha')
plt.text(26,156, 'alpha=1.0')
plt.savefig('../1400OS_04_02+.png')
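# Tiny gensim LDA sketch on a toy corpus (illustrative only; the runs above use
# the AP corpus with 100 topics): model[doc] returns a sparse list of
# (topic_id, weight) pairs, which is why len(t) above counts a document's topics.
from gensim import corpora, models
toy_docs = [["cat", "dog", "cat"], ["dog", "bird"], ["stock", "market", "stock"]]
toy_dict = corpora.Dictionary(toy_docs)
toy_bow = [toy_dict.doc2bow(d) for d in toy_docs]
toy_lda = models.ldamodel.LdaModel(toy_bow, num_topics=2, id2word=toy_dict)
print(toy_lda[toy_bow[0]])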
import nltk.corpus
import milk
import numpy as np
import string
from gensim import corpora, models, similarities
import sklearn.datasets
import nltk.stem
from collections import defaultdict
english_stemmer = nltk.stem.SnowballStemmer('english')
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.update(['from:', 'subject:', 'writes:', 'writes'])
class DirectText(corpora.textcorpus.TextCorpus):
def get_texts(self):
return self.input
def __len__(self):
return len(self.input)
dataset = sklearn.datasets.load_mlcomp("20news-18828", "train",
mlcomp_root='../data')
otexts = dataset.data
texts = dataset.data
texts = [t.decode('utf-8', 'ignore') for t in texts]
texts = [t.split() for t in texts]
texts = [list(map(lambda w: w.lower(), t)) for t in texts] # map/lambda inside a list comprehension; list() materialises the lazy result
texts = [list(filter(lambda s: not len(set("+-.?!()>@012345689") & set(s)), t)) for t in texts]
texts = [list(filter(lambda s: (len(s) > 3) and (s not in stopwords), t)) for t in texts]
texts = [list(map(english_stemmer.stem, t)) for t in texts]
usage = defaultdict(int)
for t in texts:
for w in set(t):
usage[w] += 1
limit = len(texts)/10
too_common = [w for w in usage if usage[w] > limit]
too_common = set(too_common)
texts = [list(filter(lambda s: s not in too_common, t)) for t in texts]
corpus = DirectText(texts)
dictionary = corpus.dictionary
try:
dictionary['computer']
except:
pass
model = models.ldamodel.LdaModel(corpus, num_topics=100, id2word=dictionary.id2token)
thetas = np.zeros((len(texts),100))
for i,c in enumerate(corpus):
for ti,v in model[c]:
thetas[i,ti] += v
distances = milk.unsupervised.pdist(thetas) # milk library: pairwise distances
large = distances.max() + 1
for i in range(len(distances)): distances[i, i] = large
print(otexts[1])
print()
print()
print()
print(otexts[distances[1].argmin()])
from __future__ import print_function
import numpy as np
import logging, gensim
logging.basicConfig(
format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('data/wiki_en_output_wordids.txt')
mm = gensim.corpora.MmCorpus('data/wiki_en_output_tfidf.mm')
model = gensim.models.ldamodel.LdaModel(
corpus=mm,
id2word=id2word,
num_topics=100,
update_every=1,
chunksize=10000,
passes=1)
model.save('wiki_lda.pkl')
topics = [model[doc] for doc in mm]
lens = np.array([len(t) for t in topics])
print(np.mean(lens <= 10))
print(np.mean(lens))
counts = np.zeros(100)
for doc_top in topics:
for ti, _ in doc_top:
counts[ti] += 1
words = model.show_topic(counts.argmax(), 64)
print(words)
print()
print()
print()
words = model.show_topic(counts.argmin(), 64)
print(words)
print()
print()
print()
Chapter 5
import os
try:
import ujson as json # UltraJSON if available; note the try/except wrapped around the import
except:
import json
import sys
from collections import defaultdict
try:
import enchant
except:
print(
"Enchant is not installed. You can get it from http://packages.python.org/pyenchant/. Exitting...")
sys.exit(1)
from data import chosen, chosen_meta, filtered, filtered_meta
filtered_meta = json.load(open(filtered_meta, "r"))
speller = enchant.Dict("en_US")
def misspelled_fraction(p):
tokens = p.split()
if not tokens:
return 0.0
return 1 - float(sum(speller.check(t) for t in tokens)) / len(tokens) # generator expression inside sum(), similar to a list comprehension
def data(filename, col=None):
for line in open(filename, "r"):
data = line.strip().split("\t")
# check format
Id, ParentId, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = data
if col:
yield data[col] # generator; values are produced lazily as the caller iterates (e.g. in a for loop)
else:
yield data
posts_to_keep = set()
found_questions = 0
num_qestion_sample = 1000
# keep the best and worst, but only if we have one with positive and one with negative score
# filter_method = "negative_positive"
# if true, only keep the lowest scoring answer per class in addition to the accepted one
# filter_method = "only_one_per_class "
# if not None, specifies the number of unaccepted per question
# filter_method = "sample_per_question"
filter_method = "negative_positive" # warning: this does not retrieve many!
# filter_method = "only_one_per_class"
MaxAnswersPerQuestions = 10 # filter_method == "sample_per_question"
# filter_method = "all"
# equal share of questions that are unanswered and those that are answered
# filter_method = "half-half"
unaccepted_scores = {}
has_q_accepted_a = {}
num_q_with_accepted_a = 0
num_q_without_accepted_a = 0
for ParentId, posts in filtered_meta.items():
assert ParentId != -1
if len(posts) < 2:
continue
ParentId = int(ParentId)
AllIds = set([ParentId])
AcceptedId = None
UnacceptedId = None
UnacceptedIds = []
UnacceptedScore = sys.maxsize
NegativeScoreIds = []
PositiveScoreIds = []
if filter_method == "half-half":
has_accepted_a = False
for post in posts:
Id, IsAccepted, TimeToAnswer, Score = post
if IsAccepted:
has_accepted_a = True
break
has_q_accepted_a[ParentId] = has_accepted_a
if has_accepted_a:
if num_q_with_accepted_a < num_qestion_sample / 2:
num_q_with_accepted_a += 1
posts_to_keep.add(ParentId)
else:
if num_q_without_accepted_a < num_qestion_sample / 2:
num_q_without_accepted_a += 1
posts_to_keep.add(ParentId)
if num_q_without_accepted_a + num_q_with_accepted_a > num_qestion_sample:
assert -1 not in posts_to_keep
break
else:
for post in posts:
Id, IsAccepted, TimeToAnswer, Score = post
if filter_method == "all":
AllIds.add(int(Id))
elif filter_method == "only_one_per_class":
if IsAccepted:
AcceptedId = Id
elif Score < UnacceptedScore:
UnacceptedScore = Score
UnacceptedId = Id
elif filter_method == "sample_per_question":
if IsAccepted:
AcceptedId = Id
else:
UnacceptedIds.append(Id)
elif filter_method == "negative_positive":
if Score < 0:
NegativeScoreIds.append((Score, Id))
elif Score > 0:
PositiveScoreIds.append((Score, Id))
else:
raise ValueError(filter_method)
added = False
if filter_method == "all":
posts_to_keep.update(AllIds)
added = True
elif filter_method == "only_one_per_class":
if AcceptedId is not None and UnacceptedId is not None:
posts_to_keep.add(ParentId)
posts_to_keep.add(AcceptedId)
posts_to_keep.add(UnacceptedId)
added = True
elif filter_method == "sample_per_question":
if AcceptedId is not None and UnacceptedIds is not None: # note: None is neither 0, nor an empty string, nor an empty list in Python
posts_to_keep.add(ParentId)
posts_to_keep.add(AcceptedId)
posts_to_keep.update(UnacceptedIds[:MaxAnswersPerQuestions])
added = True
elif filter_method == "negative_positive":
if PositiveScoreIds and NegativeScoreIds:
posts_to_keep.add(ParentId)
posScore, posId = sorted(PositiveScoreIds)[-1]
posts_to_keep.add(posId)
negScore, negId = sorted(NegativeScoreIds)[0]
posts_to_keep.add(negId)
print("%i: %i/%i %i/%i" % (ParentId, posId,
posScore, negId, negScore))
added = True
if added:
found_questions += 1
if num_qestion_sample and found_questions >= num_qestion_sample:
break
total = 0
kept = 0
already_written = set()
chosen_meta_dict = defaultdict(dict)
with open(chosen, "w") as f:
for line in data(filtered):
strId, ParentId, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = line
Text = Text.strip()
total += 1
Id = int(strId)
if Id in posts_to_keep:
if Id in already_written:
print(Id, "is already written")
continue
if kept % 100 == 0:
print(kept)
# setting meta info
post = chosen_meta_dict[Id]
post['ParentId'] = int(ParentId)
post['IsAccepted'] = int(IsAccepted)
post['TimeToAnswer'] = int(TimeToAnswer)
post['Score'] = int(Score)
post['NumTextTokens'] = int(NumTextTokens)
post['NumCodeLines'] = int(NumCodeLines)
post['LinkCount'] = int(LinkCount)
post['MisSpelledFraction'] = misspelled_fraction(Text)
post['NumImages'] = int(NumImages)
post['idx'] = kept # index into the file
if int(ParentId) == -1:
q = chosen_meta_dict[Id]
if not 'Answers' in q:
q['Answers'] = []
if filter_method == "half-half":
q['HasAcceptedAnswer'] = has_q_accepted_a[Id]
else:
q = chosen_meta_dict[int(ParentId)]
if int(IsAccepted) == 1:
assert 'HasAcceptedAnswer' not in q
q['HasAcceptedAnswer'] = True
if 'Answers' not in q:
q['Answers'] = [Id]
else:
q['Answers'].append(Id)
f.writelines("%s\t%s\n" % (Id, Text))
kept += 1
with open(chosen_meta, "w") as fm:
json.dump(chosen_meta_dict, fm)
print("total=", total)
print("kept=", kept)
import time
start_time = time.time()
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import KFold
from sklearn import neighbors
from data import chosen, chosen_meta
from utils import plot_roc, plot_pr
from utils import plot_feat_importance
from utils import load_meta
from utils import fetch_posts
from utils import plot_feat_hist
from utils import plot_bias_variance
from utils import plot_k_complexity
# question Id -> {'features': feature vector, 'answers': [answer Ids], 'scores': [scores]}
# scores will be added on-the-fly as they are not in meta
meta, id_to_idx, idx_to_id = load_meta(chosen_meta)
import nltk
# splitting questions into train (70%) and test(30%) and then take their
# answers
all_posts = list(meta.keys())
all_questions = [q for q, v in meta.items() if v['ParentId'] == -1]
all_answers = [q for q, v in meta.items() if v['ParentId'] != -1] # [:500]
feature_names = np.array((
'NumTextTokens',
'NumCodeLines',
'LinkCount',
'AvgSentLen',
'AvgWordLen',
'NumAllCaps',
'NumExclams',
'NumImages'
))
# activate the following for reduced feature space
"""
feature_names = np.array((
'NumTextTokens',
'LinkCount',
))
"""
def prepare_sent_features():
for pid, text in fetch_posts(chosen, with_index=True):
if not text:
meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0
else:
sent_lens = [len(nltk.word_tokenize(
sent)) for sent in nltk.sent_tokenize(text)]
meta[pid]['AvgSentLen'] = np.mean(sent_lens)
meta[pid]['AvgWordLen'] = np.mean(
[len(w) for w in nltk.word_tokenize(text)])
meta[pid]['NumAllCaps'] = np.sum(
[word.isupper() for word in nltk.word_tokenize(text)])
meta[pid]['NumExclams'] = text.count('!')
prepare_sent_features()
def get_features(aid):
return tuple(meta[aid][fn] for fn in feature_names)
qa_X = np.asarray([get_features(aid) for aid in all_answers])
# Score > 0 tests => positive class is good answer
# Score <= 0 tests => positive class is poor answer
qa_Y = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers])
classifying_answer = "good"
for idx, feat in enumerate(feature_names):
plot_feat_hist([(qa_X[:, idx], feat)])
"""
plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [1,0]], 'feat_hist_two.png')
plot_feat_hist([(qa_X[:, idx], feature_names[idx]) for idx in [3,4,5,6]], 'feat_hist_four.png')
"""
avg_scores_summary = []
def measure(clf_class, parameters, name, data_size=None, plot=False):
start_time_clf = time.time()
if data_size is None:
X = qa_X
Y = qa_Y
else:
X = qa_X[:data_size]
Y = qa_Y[:data_size]
cv = KFold(n=len(X), n_folds=10, indices=True)
train_errors = []
test_errors = []
scores = []
roc_scores = []
fprs, tprs = [], []
pr_scores = []
precisions, recalls, thresholds = [], [], []
for train, test in cv:
X_train, y_train = X[train], Y[train]
X_test, y_test = X[test], Y[test]
clf = clf_class(**parameters) # keyword-argument unpacking
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
train_errors.append(1 - train_score)
test_errors.append(1 - test_score)
scores.append(test_score)
proba = clf.predict_proba(X_test)
label_idx = 1
fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx])
precision, recall, pr_thresholds = precision_recall_curve(
y_test, proba[:, label_idx])
roc_scores.append(auc(fpr, tpr))
fprs.append(fpr)
tprs.append(tpr)
pr_scores.append(auc(recall, precision))
precisions.append(precision)
recalls.append(recall)
thresholds.append(pr_thresholds)
print(classification_report(y_test, proba[:, label_idx] >
0.63, target_names=['not accepted', 'accepted']))
# get medium clone
scores_to_sort = pr_scores # roc_scores
medium = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
if plot:
#plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
plot_pr(pr_scores[medium], name, precisions[medium], recalls[medium], classifying_answer + " answers")
if hasattr(clf, 'coef_'):
plot_feat_importance(feature_names, clf, name)
summary = (name,
np.mean(scores), np.std(scores),
np.mean(roc_scores), np.std(roc_scores),
np.mean(pr_scores), np.std(pr_scores),
time.time() - start_time_clf)
print(summary)
avg_scores_summary.append(summary)
precisions = precisions[medium]
recalls = recalls[medium]
thresholds = np.hstack(([0], thresholds[medium]))
idx80 = precisions >= 0.8
print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0], recalls[
idx80][0], thresholds[idx80][0]))
return np.mean(train_errors), np.mean(test_errors)
def bias_variance_analysis(clf_class, parameters, name):
data_sizes = np.arange(60, 2000, 4)
train_errors = []
test_errors = []
for data_size in data_sizes:
train_error, test_error = measure(
clf_class, parameters, name, data_size=data_size)
train_errors.append(train_error)
test_errors.append(test_error)
plot_bias_variance(data_sizes, train_errors, test_errors, name, "Bias-Variance for '%s'" % name)
def k_complexity_analysis(clf_class, parameters):
ks = np.hstack((np.arange(1, 20), np.arange(21, 100, 5)))
train_errors = []
test_errors = []
for k in ks:
parameters['n_neighbors'] = k
train_error, test_error = measure(
clf_class, parameters, "%dNN" % k, data_size=2000)
train_errors.append(train_error)
test_errors.append(test_error)
plot_k_complexity(ks, train_errors, test_errors)
for k in [5]: #[5, 10, 40, 90]:
bias_variance_analysis(neighbors.KNeighborsClassifier, {'n_neighbors':k, 'warn_on_equidistant':False}, "%iNN"%k)
k_complexity_analysis(neighbors.KNeighborsClassifier, {'n_neighbors':k,
'warn_on_equidistant':False})
#measure(neighbors.KNeighborsClassifier, {'n_neighbors': k, 'p': 2,
#'warn_on_equidistant': False}, "%iNN" % k)
from sklearn.linear_model import LogisticRegression
for C in [0.1]: #[0.01, 0.1, 1.0, 10.0]:
name = "LogReg C=%.2f" % C
bias_variance_analysis(LogisticRegression, {'penalty':'l2', 'C':C}, name)
measure(LogisticRegression, {'penalty': 'l2', 'C': C}, name, plot=True)
print("=" * 50)
from operator import itemgetter
for s in reversed(sorted(avg_scores_summary, key=itemgetter(1))):
print("%-20s\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f" % s)
print("time spent:", time.time() - start_time)
import os
# DATA_DIR = r"C:\pymlbook-data\ch05"
DATA_DIR = r"/media/sf_C/pymlbook-data/ch05"
CHART_DIR = os.path.join("..", "charts")
filtered = os.path.join(DATA_DIR, "filtered.tsv")
filtered_meta = os.path.join(DATA_DIR, "filtered-meta.json")
chosen = os.path.join(DATA_DIR, "chosen.tsv")
chosen_meta = os.path.join(DATA_DIR, "chosen-meta.json")
import numpy as np
from scipy.stats import norm
from matplotlib import pyplot
np.random.seed(3)
num_per_class = 40
X = np.hstack((norm.rvs(2, size=num_per_class, scale=2),
norm.rvs(8, size=num_per_class, scale=3)))
y = np.hstack((np.zeros(num_per_class),
np.ones(num_per_class)))
def lr_model(clf, X):
return 1.0 / (1.0 + np.exp(-(clf.intercept_ + clf.coef_ * X)))
from sklearn.linear_model import LogisticRegression
logclf = LogisticRegression()
print(logclf)
logclf.fit(X.reshape(num_per_class * 2, 1), y) # reshape into a column vector (one feature)
print(np.exp(logclf.intercept_), np.exp(logclf.coef_.ravel())) # intercept and coefficient of the logistic regression, exponentiated to odds
print("P(x=-1)=%.2f\tP(x=7)=%.2f" % (lr_model(logclf, -1), lr_model(logclf, 7)))
X_test = np.arange(-5, 20, 0.1)
pyplot.figure(figsize=(10, 4))
pyplot.xlim((-5, 20))
pyplot.scatter(X, y, c=y)
pyplot.xlabel("feature value")
pyplot.ylabel("class")
pyplot.grid(True, linestyle='-', color='0.75')
pyplot.savefig("log_reg_example_data.png", bbox_inches="tight")
def lin_model(clf, X):
return clf.intercept_ + clf.coef_ * X
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
print(clf)
clf.fit(X.reshape(num_per_class * 2, 1), y)
X_odds = np.arange(0, 1, 0.001)
pyplot.figure(figsize=(10, 4))
pyplot.subplot(1, 2, 1)
pyplot.scatter(X, y, c=y)
pyplot.plot(X_test, lin_model(clf, X_test))
pyplot.xlabel("feature value")
pyplot.ylabel("class")
pyplot.title("linear fit on original data")
pyplot.grid(True, linestyle='-', color='0.75')
X_ext = np.hstack((X, norm.rvs(20, size=100, scale=5)))
y_ext = np.hstack((y, np.ones(100)))
clf = LinearRegression()
clf.fit(X_ext.reshape(num_per_class * 2 + 100, 1), y_ext)
pyplot.subplot(1, 2, 2)
pyplot.scatter(X_ext, y_ext, c=y_ext)
pyplot.plot(X_ext, lin_model(clf, X_ext))
pyplot.xlabel("feature value")
pyplot.ylabel("class")
pyplot.title("linear fit on additional data")
pyplot.grid(True, linestyle='-', color='0.75')
pyplot.savefig("log_reg_log_linear_fit.png", bbox_inches="tight")
pyplot.figure(figsize=(10, 4))
pyplot.xlim((-5, 20))
pyplot.scatter(X, y, c=y)
pyplot.plot(X_test, lr_model(logclf, X_test).ravel())
pyplot.plot(X_test, np.ones(X_test.shape[0]) * 0.5, "--")
pyplot.xlabel("feature value")
pyplot.ylabel("class")
pyplot.grid(True, linestyle='-', color='0.75')
pyplot.savefig("log_reg_example_fitted.png", bbox_inches="tight")
X = np.arange(0, 1, 0.001)
pyplot.figure(figsize=(10, 4))
pyplot.subplot(1, 2, 1)
pyplot.xlim((0, 1))
pyplot.ylim((0, 10))
pyplot.plot(X, X / (1 - X))
pyplot.xlabel("P")
pyplot.ylabel("odds = P / (1-P)")
pyplot.grid(True, linestyle='-', color='0.75')
pyplot.subplot(1, 2, 2)
pyplot.xlim((0, 1))
pyplot.plot(X, np.log(X / (1 - X)))
pyplot.xlabel("P")
pyplot.ylabel("log(odds) = log(P / (1-P))")
pyplot.grid(True, linestyle='-', color='0.75')
pyplot.savefig("log_reg_log_odds.png", bbox_inches="tight")
import re
from operator import itemgetter # handy as a sort key
from collections import Mapping
import scipy.sparse as sp
from sklearn.base import BaseEstimator # base class for scikit-learn estimators
from sklearn.feature_extraction.text import strip_accents_ascii, strip_accents_unicode
import nltk
from collections import Counter
try:
import ujson as json # UltraJSON if available
except:
import json
poscache_filename = "poscache.json"
class PosCounter(Counter):
def __init__(self, iterable=(), normalize=True, poscache=None, **kwargs):
self.n_sents = 0
self.normalize = normalize
self.poscache = poscache
super(PosCounter, self).__init__(iterable, **kwargs)
def update(self, other):
"""Adds counts for elements in other"""
if isinstance(other, self.__class__):
self.n_sents += other.n_sents
for x, n in other.items():
self[x] += n
else:
for sent in other:
self.n_sents += 1
# import pdb;pdb.set_trace()
if self.poscache is not None:
if sent in self.poscache:
tags = self.poscache[sent]
else:
self.poscache[sent] = tags = nltk.pos_tag(
nltk.word_tokenize(sent))
else:
tags = nltk.pos_tag(nltk.word_tokenize(sent))
for x in tags:
tok, tag = x
self[tag] += 1
if self.normalize:
for x, n in self.items():
self[x] /= float(self.n_sents)
class PosTagFreqVectorizer(BaseEstimator):
"""
Convert a collection of raw documents to a matrix Pos tag frequencies
"""
def __init__(self, input='content', charset='utf-8',
charset_error='strict', strip_accents=None,
vocabulary=None,
normalize=True,
dtype=float):
self.input = input
self.charset = charset
self.charset_error = charset_error
self.strip_accents = strip_accents
if vocabulary is not None:
self.fixed_vocabulary = True
if not isinstance(vocabulary, Mapping):
vocabulary = dict((t, i) for i, t in enumerate(vocabulary))
self.vocabulary_ = vocabulary
else:
self.fixed_vocabulary = False
try:
self.poscache = json.load(open(poscache_filename, "r"))
except IOError:
self.poscache = {}
self.normalize = normalize
self.dtype = dtype
def write_poscache(self):
json.dump(self.poscache, open(poscache_filename, "w"))
def decode(self, doc):
"""Decode the input into a string of unicode symbols
The decoding strategy depends on the vectorizer parameters.
"""
if self.input == 'filename':
doc = open(doc, 'rb').read()
elif self.input == 'file':
doc = doc.read()
if isinstance(doc, bytes):
doc = doc.decode(self.charset, self.charset_error)
return doc
def build_preprocessor(self):
"""Return a function to preprocess the text before tokenization"""
# unfortunately python functools package does not have an efficient
# `compose` function that would have allowed us to chain a dynamic
# number of functions. However, the overhead of a lambda call is a few
# hundred nanoseconds, which is negligible when compared to the
# cost of tokenizing a string of 1000 chars for instance.
noop = lambda x: x
# accent stripping
if not self.strip_accents:
strip_accents = noop
elif hasattr(self.strip_accents, '__call__'):
strip_accents = self.strip_accents
elif self.strip_accents == 'ascii':
strip_accents = strip_accents_ascii
elif self.strip_accents == 'unicode':
strip_accents = strip_accents_unicode
else:
raise ValueError('Invalid value for "strip_accents": %s' %
self.strip_accents)
only_prose = lambda s: re.sub('<[^>]*>', '', s).replace("\n", " ")
return lambda x: strip_accents(only_prose(x))
def build_tokenizer(self):
"""Return a function that split a string in sequence of tokens"""
return nltk.sent_tokenize
def build_analyzer(self):
"""Return a callable that handles preprocessing and tokenization"""
preprocess = self.build_preprocessor()
tokenize = self.build_tokenizer()
return lambda doc: tokenize(preprocess(self.decode(doc)))
def _term_count_dicts_to_matrix(self, term_count_dicts):
i_indices = []
j_indices = []
values = []
vocabulary = self.vocabulary_
for i, term_count_dict in enumerate(term_count_dicts):
for term, count in term_count_dict.items():
j = vocabulary.get(term)
if j is not None:
i_indices.append(i)
j_indices.append(j)
values.append(count)
# free memory as we go
term_count_dict.clear()
shape = (len(term_count_dicts), max(vocabulary.values()) + 1)
spmatrix = sp.csr_matrix((values, (i_indices, j_indices)),
shape=shape, dtype=self.dtype)
return spmatrix
def fit(self, raw_documents, y=None):
"""Learn a vocabulary dictionary of all tokens in the raw documents
Parameters
----------
raw_documents: iterable
an iterable which yields either str, unicode or file objects
Returns
-------
self
"""
self.fit_transform(raw_documents)
return self
def fit_transform(self, raw_documents, y=None):
"""Learn the vocabulary dictionary and return the count vectors
This is more efficient than calling fit followed by transform.
Parameters
----------
raw_documents: iterable
an iterable which yields either str, unicode or file objects
Returns
-------
vectors: array, [n_samples, n_features]
"""
if self.fixed_vocabulary:
# No need to fit anything, directly perform the transformation.
# We intentionally don't call the transform method to make it
# fit_transform overridable without unwanted side effects in
# TfidfVectorizer
analyze = self.build_analyzer()
term_counts_per_doc = [PosCounter(analyze(doc), normalize=self.normalize, poscache=self.poscache)
for doc in raw_documents]
return self._term_count_dicts_to_matrix(term_counts_per_doc)
self.vocabulary_ = {}
# result of document conversion to term count dicts
term_counts_per_doc = []
term_counts = Counter()
analyze = self.build_analyzer()
for doc in raw_documents:
term_count_current = PosCounter(
analyze(doc), normalize=self.normalize, poscache=self.poscache)
term_counts.update(term_count_current)
term_counts_per_doc.append(term_count_current)
self.write_poscache()
terms = set(term_counts)
# store map from term name to feature integer index: we sort the term
# to have reproducible outcome for the vocabulary structure: otherwise
# the mapping from feature name to indices might depend on the memory
# layout of the machine. Furthermore sorted terms might make it
# possible to perform binary search in the feature names array.
self.vocabulary_ = dict(((t, i) for i, t in enumerate(sorted(terms))))
return self._term_count_dicts_to_matrix(term_counts_per_doc)
def transform(self, raw_documents):
"""Extract token counts out of raw text documents using the vocabulary
fitted with fit or the one provided in the constructor.
Parameters
----------
raw_documents: iterable
an iterable which yields either str, unicode or file objects
Returns
-------
vectors: sparse matrix, [n_samples, n_features]
"""
if not hasattr(self, 'vocabulary_') or len(self.vocabulary_) == 0:
raise ValueError("Vocabulary wasn't fitted or is empty!")
# raw_documents can be an iterable so we don't know its size in
# advance
# XXX @larsmans tried to parallelize the following loop with joblib.
# The result was some 20% slower than the serial version.
analyze = self.build_analyzer()
term_counts_per_doc = [Counter(analyze(doc)) for doc in raw_documents]
return self._term_count_dicts_to_matrix(term_counts_per_doc)
def get_feature_names(self):
"""Array mapping from feature integer indices to feature name"""
if not hasattr(self, 'vocabulary_') or len(self.vocabulary_) == 0:
raise ValueError("Vocabulary wasn't fitted or is empty!")
return [t for t, i in sorted(iter(self.vocabulary_.items()),
key=itemgetter(1))]
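# Hypothetical usage sketch of the PosTagFreqVectorizer defined above (the toy
# documents are made up; requires NLTK's sentence tokenizer and POS tagger data):
toy_docs = ["The cat sat on the mat.", "Dogs bark loudly!"]
pos_vec = PosTagFreqVectorizer(normalize=True)
pos_X = pos_vec.fit_transform(toy_docs)   # sparse matrix of per-sentence POS-tag frequencies
print(pos_vec.get_feature_names())        # tags such as 'DT', 'NN', 'VBD', ...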
#
# This script filters the posts and keeps those posts that are or belong
# to a question that has been asked in 2011 or 2012.
#
import os
import re
try:
import ujson as json # UltraJSON if available
except:
import json
from dateutil import parser as dateparser
from operator import itemgetter
from xml.etree import cElementTree as etree
from collections import defaultdict
from data import DATA_DIR
filename = os.path.join(DATA_DIR, "posts-2011-12.xml")
filename_filtered = os.path.join(DATA_DIR, "filtered.tsv")
q_creation = {} # creation datetimes of questions
q_accepted = {} # id of accepted answer
meta = defaultdict(
list) # question -> [(answer Id, IsAccepted, TimeToAnswer, Score), ...]
# regexes (assumed forms; the HTML tags were lost in transit) to find code
# snippets, links, and images in the post body HTML
code_match = re.compile('<pre>(.*?)</pre>', re.MULTILINE | re.DOTALL)
link_match = re.compile('<a href="http://.*?".*?>(.*?)</a>', re.MULTILINE | re.DOTALL)
img_match = re.compile('<img(.*?)/>', re.MULTILINE | re.DOTALL)
tag_match = re.compile('<[^>]*>', re.MULTILINE | re.DOTALL)
def filter_html(s):
num_code_lines = 0
link_count_in_code = 0
code_free_s = s
num_images = len(img_match.findall(s))
# remove source code and count how many lines it spans (regex usage)
for match_str in code_match.findall(s):
num_code_lines += match_str.count('\n')
code_free_s = code_match.sub("", code_free_s)
# sometimes source code contain links, which we don't want to count
link_count_in_code += len(link_match.findall(match_str))
anchors = link_match.findall(s)
link_count = len(anchors)
link_count -= link_count_in_code
html_free_s = re.sub(
" +", " ", tag_match.sub('', code_free_s)).replace("\n", "")
link_free_s = html_free_s
for anchor in anchors:
if anchor.lower().startswith("http://"):
link_free_s = link_free_s.replace(anchor, '')
num_text_tokens = html_free_s.count(" ")
return link_free_s, num_text_tokens, num_code_lines, link_count, num_images
years = defaultdict(int)
num_questions = 0
num_answers = 0
def parsexml(filename):
global num_questions, num_answers
counter = 0
it = map(itemgetter(1),
iter(etree.iterparse(filename, events=('start',))))
root = next(it) # get posts element
for elem in it:
if counter % 100000 == 0:
print(counter)
counter += 1
if elem.tag == 'row':
creation_date = dateparser.parse(elem.get('CreationDate'))
# import pdb;pdb.set_trace()
# if creation_date.year < 2011:
# continue
Id = int(elem.get('Id'))
PostTypeId = int(elem.get('PostTypeId'))
Score = int(elem.get('Score'))
if PostTypeId == 1:
num_questions += 1
years[creation_date.year] += 1
ParentId = -1
TimeToAnswer = 0
q_creation[Id] = creation_date
accepted = elem.get('AcceptedAnswerId')
if accepted:
q_accepted[Id] = int(accepted)
IsAccepted = 0
elif PostTypeId == 2:
num_answers += 1
ParentId = int(elem.get('ParentId'))
if not ParentId in q_creation:
# question was too far in the past
continue
TimeToAnswer = (creation_date - q_creation[ParentId]).seconds
if ParentId in q_accepted:
IsAccepted = int(q_accepted[ParentId] == Id)
else:
IsAccepted = 0
meta[ParentId].append((Id, IsAccepted, TimeToAnswer, Score))
else:
continue
Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = filter_html(
elem.get('Body'))
values = (Id, ParentId,
IsAccepted,
TimeToAnswer, Score,
Text,
NumTextTokens, NumCodeLines, LinkCount, NumImages)
yield values
root.clear() # preserve memory
with open(os.path.join(DATA_DIR, filename_filtered), "w") as f:
for item in parsexml(filename):
line = "\t".join(map(str, item))
f.write(line.encode("utf-8") + "\n")
with open(os.path.join(DATA_DIR, "filtered-meta.json"), "w") as f:
json.dump(meta, f)
print("years:", years)
print("#qestions: %i" % num_questions)
print("#answers: %i" % num_answers)
import os
try:
import ujson as json # UltraJSON if available
except:
import json
from matplotlib import pylab
import numpy as np
from data import CHART_DIR
def fetch_data(filename, col=None, line_count=-1, only_questions=False):
count = 0
for line in open(filename, "r"):
count += 1
if line_count > 0 and count > line_count:
break
data = Id, ParentId, IsQuestion, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, MisSpelledFraction = line.split(
"\t")
IsQuestion = int(IsQuestion)
if only_questions and not IsQuestion:
continue
if col:
if col < 6:
val = int(data[col])
else:
val = data[col]
yield val
else:
Id = int(Id)
assert Id >= 0, line
ParentId = int(ParentId)
IsAccepted = int(IsAccepted)
assert not IsQuestion == IsAccepted == 1, "%i %i --- %s" % (
IsQuestion, IsAccepted, line)
assert (ParentId == -1 and IsQuestion) or (
ParentId >= 0 and not IsQuestion), "%i %i --- %s" % (ParentId, IsQuestion, line)
TimeToAnswer = int(TimeToAnswer)
Score = int(Score)
NumTextTokens = int(NumTextTokens)
NumCodeLines = int(NumCodeLines)
LinkCount = int(LinkCount)
MisSpelledFraction = float(MisSpelledFraction)
yield Id, ParentId, IsQuestion, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, MisSpelledFraction
def fetch_posts(filename, with_index=True, line_count=-1):
count = 0
for line in open(filename, "r"):
count += 1
if line_count > 0 and count > line_count:
break
Id, Text = line.split("\t")
Text = Text.strip()
if with_index:
yield int(Id), Text
else:
yield Text
def load_meta(filename):
meta = json.load(open(filename, "r"))
keys = list(meta.keys())
# JSON only allows string keys, changing that to int
for key in keys:
meta[int(key)] = meta[key]
del meta[key]
# post Id to index in vectorized
id_to_idx = {}
# and back
idx_to_id = {}
for PostId, Info in meta.items():
id_to_idx[PostId] = idx = Info['idx']
idx_to_id[idx] = PostId
return meta, id_to_idx, idx_to_id
def plot_roc(auc_score, name, fpr, tpr):
pylab.figure(num=None, figsize=(6, 5))
pylab.plot([0, 1], [0, 1], 'k--')
pylab.xlim([0.0, 1.0])
pylab.ylim([0.0, 1.0])
pylab.xlabel('False Positive Rate')
pylab.ylabel('True Positive Rate')
pylab.title('Receiver operating characteristic (AUC=%0.2f)\n%s' % (
auc_score, name))
pylab.legend(loc="lower right")
pylab.grid(True, linestyle='-', color='0.75')
pylab.fill_between(tpr, fpr, alpha=0.5)
pylab.plot(fpr, tpr, lw=1)
pylab.savefig(os.path.join(CHART_DIR, "roc_" + name.replace(" ", "_")+ ".png"))
def plot_pr(auc_score, name, precision, recall, label=None):
pylab.figure(num=None, figsize=(6, 5))
pylab.xlim([0.0, 1.0])
pylab.ylim([0.0, 1.0])
pylab.xlabel('Recall')
pylab.ylabel('Precision')
pylab.title('P/R (AUC=%0.2f) / %s' % (auc_score, label))
pylab.fill_between(recall, precision, alpha=0.5)
pylab.grid(True, linestyle='-', color='0.75')
pylab.plot(recall, precision, lw=1)
filename = name.replace(" ", "_")
pylab.savefig(os.path.join(CHART_DIR, "pr_" + filename + ".png"))
def show_most_informative_features(vectorizer, clf, n=20):
c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names()))
top = list(zip(c_f[:n], c_f[:-(n + 1):-1]))
for (c1, f1), (c2, f2) in top:
print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2))
def plot_feat_importance(feature_names, clf, name):
pylab.figure(num=None, figsize=(6, 5))
coef_ = clf.coef_
important = np.argsort(np.absolute(coef_.ravel()))
f_imp = feature_names[important]
coef = coef_.ravel()[important]
inds = np.argsort(coef)
f_imp = f_imp[inds]
coef = coef[inds]
xpos = np.array(list(range(len(coef))))
pylab.bar(xpos, coef, width=1)
pylab.title('Feature importance for %s' % (name))
ax = pylab.gca()
ax.set_xticks(np.arange(len(coef)))
labels = ax.set_xticklabels(f_imp)
for label in labels:
label.set_rotation(90)
filename = name.replace(" ", "_")
pylab.savefig(os.path.join(
CHART_DIR, "feat_imp_%s.png" % filename), bbox_inches="tight")
def plot_feat_hist(data_name_list, filename=None):
if len(data_name_list)>1:
assert filename is not None
pylab.figure(num=None, figsize=(8, 6))
num_rows = 1 + (len(data_name_list) - 1) // 2  # integer division so range() gets an int
num_cols = 1 if len(data_name_list) == 1 else 2
pylab.figure(figsize=(5 * num_cols, 4 * num_rows))
for i in range(num_rows):
for j in range(num_cols):
pylab.subplot(num_rows, num_cols, 1 + i * num_cols + j)
x, name = data_name_list[i * num_cols + j]
pylab.title(name)
pylab.xlabel('Value')
pylab.ylabel('Fraction')
# the histogram of the data
max_val = np.max(x)
if max_val <= 1.0:
bins = 50
elif max_val > 50:
bins = 50
else:
bins = max_val
n, bins, patches = pylab.hist(
x, bins=bins, normed=1, facecolor='blue', alpha=0.75)
pylab.grid(True)
if not filename:
filename = "feat_hist_%s.png" % name.replace(" ", "_")
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
def plot_bias_variance(data_sizes, train_errors, test_errors, name, title):
pylab.figure(num=None, figsize=(6, 5))
pylab.ylim([0.0, 1.0])
pylab.xlabel('Data set size')
pylab.ylabel('Error')
pylab.title("Bias-Variance for '%s'" % name)
pylab.plot(
data_sizes, train_errors, "b-", data_sizes, test_errors, "--", lw=1)
pylab.legend(["train error", "test error"], loc="upper right")
pylab.grid(True, linestyle='-', color='0.75')
pylab.savefig(os.path.join(CHART_DIR, "bv_" + name.replace(" ", "_") + ".png"), bbox_inches="tight")
def plot_k_complexity(ks, train_errors, test_errors):
pylab.figure(num=None, figsize=(6, 5))
pylab.ylim([0.0, 1.0])
pylab.xlabel('k')
pylab.ylabel('Error')
pylab.title('Errors for different values of k')
pylab.plot(
ks, test_errors, "--", ks, train_errors, "-", lw=1)
pylab.legend(["train error", "test error"], loc="upper right")
pylab.grid(True, linestyle='-', color='0.75')
pylab.savefig(os.path.join(CHART_DIR, "kcomplexity.png"), bbox_inches="tight")
Chapter 6
import os
import collections
from matplotlib import pylab
import numpy as np
DATA_DIR = os.path.join("..", "data")
CHART_DIR = os.path.join("..", "charts")
import csv
import json
def tweak_labels(Y, pos_sent_list):
pos = Y == pos_sent_list[0]  # the comparison yields a boolean array
for sent_label in pos_sent_list[1:]:
pos |= Y == sent_label  # |= is in-place element-wise or: a |= b is equivalent to a = a | b
Y = np.zeros(Y.shape[0])
Y[pos] = 1
Y = Y.astype(int)  # type conversion to integer labels
return Y
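# A small illustration (made-up labels, not the tweet corpus) of the boolean
# masking in tweak_labels: comparing a numpy array with a string yields a
# boolean array, and |= accumulates the matches element-wise.
_demo = np.array(["positive", "neutral", "negative", "positive"])
_mask = _demo == "positive"
_mask |= _demo == "neutral"
print(_mask)                  # [ True  True False  True]
_binary = np.zeros(_demo.shape[0])
_binary[_mask] = 1
print(_binary.astype(int))    # [1 1 0 1]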
def load_sanders_data(dirname=".", line_count=-1):
count = 0
topics = []
labels = []
tweets = []
with open(os.path.join(DATA_DIR, dirname, "corpus.csv"), "r") as csvfile:
metareader = csv.reader(csvfile, delimiter=',', quotechar='"')  # note the delimiter and quotechar options
for line in metareader:
count += 1
if line_count > 0 and count > line_count:
break
topic, label, tweet_id = line
# import vimpdb;vimpdb.set_trace()
tweet_fn = os.path.join(
DATA_DIR, dirname, 'rawdata', '%s.json' % tweet_id)
tweet = json.load(open(tweet_fn, "r"))
if 'text' in tweet and tweet['user']['lang']=="en":
topics.append(topic)
labels.append(label)
tweets.append(tweet['text'])
tweets = np.asarray(tweets)
labels = np.asarray(labels)
# return topics, tweets, labels
return tweets, labels
def load_kaggle_data(filename="kaggle/training.txt", line_count=-1):
count = 0
labels = []
texts = []
read_texts = set([])
for line in open(os.path.join(DATA_DIR, filename), "r"):
count += 1
if line_count > 0 and count > line_count:
break
label, text = line.split("\t")
# Some tweets occur multiple times, so we have to
# remove them to not bias the training set.
if text in read_texts:
continue
read_texts.add(text)
labels.append(label)
texts.append(text)
texts = np.asarray(texts)
labels = np.asarray(labels, dtype=np.int)
return texts, labels
def plot_pr(auc_score, name, phase, precision, recall, label=None):
pylab.clf()  # clear the current figure
pylab.figure(num=None, figsize=(5, 4))
pylab.grid(True)
pylab.fill_between(recall, precision, alpha=0.5)
pylab.plot(recall, precision, lw=1)
pylab.xlim([0.0, 1.0])
pylab.ylim([0.0, 1.0])
pylab.xlabel('Recall')
pylab.ylabel('Precision')
pylab.title('P/R curve (AUC=%0.2f) / %s' % (auc_score, label))
filename = name.replace(" ", "_")
pylab.savefig(os.path.join(CHART_DIR, "pr_%s_%s.png"%(filename, phase)), bbox_inches="tight")
def show_most_informative_features(vectorizer, clf, n=20):
c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names()))
top = zip(c_f[:n], c_f[:-(n + 1):-1])
for (c1, f1), (c2, f2) in top:
print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2)
def plot_log():
pylab.clf()
pylab.figure(num=None, figsize=(6, 5))
x = np.arange(0.001, 1, 0.001)
y = np.log(x)
pylab.title('Relationship between probabilities and their logarithm')
pylab.plot(x, y)
pylab.grid(True)
pylab.xlabel('P')
pylab.ylabel('log(P)')
filename = 'log_probs.png'
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
def plot_feat_importance(feature_names, clf, name):
pylab.clf()
coef_ = clf.coef_
important = np.argsort(np.absolute(coef_.ravel()))
f_imp = feature_names[important]
coef = coef_.ravel()[important]
inds = np.argsort(coef)
f_imp = f_imp[inds]
coef = coef[inds]
xpos = np.array(range(len(coef)))
pylab.bar(xpos, coef, width=1)
pylab.title('Feature importance for %s' % (name))
ax = pylab.gca()
ax.set_xticks(np.arange(len(coef)))
labels = ax.set_xticklabels(f_imp)
for label in labels:
label.set_rotation(90)
filename = name.replace(" ", "_")
pylab.savefig(os.path.join(
CHART_DIR, "feat_imp_%s.png" % filename), bbox_inches="tight")
def plot_feat_hist(data_name_list, filename=None):
pylab.clf()
# import pdb;pdb.set_trace()
num_rows = 1 + (len(data_name_list) - 1) / 2
num_cols = 1 if len(data_name_list) == 1 else 2
pylab.figure(figsize=(5 * num_cols, 4 * num_rows))
for i in range(num_rows):
for j in range(num_cols):
pylab.subplot(num_rows, num_cols, 1 + i * num_cols + j)
x, name = data_name_list[i * num_cols + j]
pylab.title(name)
pylab.xlabel('Value')
pylab.ylabel('Density')
# the histogram of the data
max_val = np.max(x)
if max_val <= 1.0:
bins = 50
elif max_val > 50:
bins = 50
else:
bins = max_val
n, bins, patches = pylab.hist(
x, bins=bins, normed=1, facecolor='green', alpha=0.75)
pylab.grid(True)
if not filename:
filename = "feat_hist_%s.png" % name
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
def plot_bias_variance(data_sizes, train_errors, test_errors, name):
pylab.clf()
pylab.ylim([0.0, 1.0])
pylab.xlabel('Data set size')
pylab.ylabel('Error')
pylab.title("Bias-Variance for '%s'" % name)
pylab.plot(
data_sizes, train_errors, "-", data_sizes, test_errors, "--", lw=1)
pylab.legend(["train error", "test error"], loc="upper right")
pylab.grid()
pylab.savefig(os.path.join(CHART_DIR, "bv_" + name + ".png"))
def load_sent_word_net():
sent_scores = collections.defaultdict(list)
with open(os.path.join(DATA_DIR, "SentiWordNet_3.0.0_20130122.txt"), "r") as csvfile:
reader = csv.reader(csvfile, delimiter='\t', quotechar='"')
for line in reader:
if line[0].startswith("#"):  # skip comment lines
continue
if len(line)==1:
continue
POS,ID,PosScore,NegScore,SynsetTerms,Gloss = line
if len(POS)==0 or len(ID)==0:
continue
#print POS,PosScore,NegScore,SynsetTerms
for term in SynsetTerms.split(" "):
term = term.split("#")[0] # drop #number at the end of every term
term = term.replace("-", " ").replace("_", " ")  # two replace calls chained
key = "%s/%s"%(POS,term.split("#")[0])
sent_scores[key].append((float(PosScore), float(NegScore)))
for key, value in sent_scores.iteritems():
sent_scores[key] = np.mean(value, axis=0)
return sent_scores
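# A tiny sketch (toy scores, not SentiWordNet entries) of the aggregation above:
# defaultdict(list) collects (PosScore, NegScore) pairs per key, and
# np.mean(..., axis=0) averages them column by column.
_demo_scores = collections.defaultdict(list)
_demo_scores["a/good"].append((0.75, 0.0))
_demo_scores["a/good"].append((0.25, 0.5))
print(np.mean(_demo_scores["a/good"], axis=0))   # [ 0.5   0.25]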
def log_false_positives(clf, X, y, name):
with open("FP_"+name.replace(" ", "_")+".tsv", "w") as f:
false_positive = clf.predict(X)!=y
for tweet, false_class in zip(X[false_positive], y[false_positive]):
f.write("%s\t%s\n"%(false_class, tweet.encode("ascii", "ignore")))
if __name__ == '__main__':
plot_log()
import time
start_time = time.time()
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import ShuffleSplit
from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline  # pipeline mechanism: chain vectorizer and classifier
from sklearn.naive_bayes import MultinomialNB
def create_ngram_model():
tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
analyzer="word", binary=False)
clf = MultinomialNB()
pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])
return pipeline
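# A minimal sketch (toy sentences, not the Sanders tweets) of the Pipeline
# mechanism: fit and predict run the vectorizer and the classifier in sequence,
# so the two steps behave like a single estimator.
_toy = create_ngram_model()
_toy.fit(np.asarray(["good happy great", "bad awful sad"]), np.asarray([1, 0]))
print(_toy.predict(np.asarray(["really great happy stuff"])))   # e.g. [1]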
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
train_errors = []
test_errors = []
scores = []
pr_scores = []
precisions, recalls, thresholds = [], [], []
for train, test in cv:
X_train, y_train = X[train], Y[train]
X_test, y_test = X[test], Y[test]
clf = clf_factory()
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
train_errors.append(1 - train_score)
test_errors.append(1 - test_score)
scores.append(test_score)
proba = clf.predict_proba(X_test)
fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
precision, recall, pr_thresholds = precision_recall_curve(
y_test, proba[:, 1])
pr_scores.append(auc(recall, precision))
precisions.append(precision)
recalls.append(recall)
thresholds.append(pr_thresholds)
scores_to_sort = pr_scores
median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
if plot:
plot_pr(pr_scores[median], name, "01", precisions[median],
recalls[median], label=name)
summary = (np.mean(scores), np.std(scores),
np.mean(pr_scores), np.std(pr_scores))
print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary
return np.mean(train_errors), np.mean(test_errors)
def print_incorrect(clf, X, Y):
Y_hat = clf.predict(X)
wrong_idx = Y_hat != Y
X_wrong = X[wrong_idx]
Y_wrong = Y[wrong_idx]
Y_hat_wrong = Y_hat[wrong_idx]
for idx in xrange(len(X_wrong)):
print "clf.predict('%s')=%i instead of %i" %\
(X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])
if __name__ == "__main__":
X_orig, Y_orig = load_sanders_data()
classes = np.unique(Y_orig)
for c in classes:
print "#%s: %i" % (c, sum(Y_orig == c))
print "== Pos vs. neg =="
pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
X = X_orig[pos_neg]
Y = Y_orig[pos_neg]
Y = tweak_labels(Y, ["positive"])
train_model(create_ngram_model, X, Y, name="pos vs neg", plot=True)
print "== Pos/neg vs. irrelevant/neutral =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive", "negative"])
train_model(create_ngram_model, X, Y, name="sent vs rest", plot=True)
print "== Pos vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive"])
train_model(create_ngram_model, X, Y, name="pos vs rest", plot=True)
print "== Neg vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["negative"])
train_model(create_ngram_model, X, Y, name="neg vs rest", plot=True)
print "time spent:", time.time() - start_time
import time
start_time = time.time()
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import ShuffleSplit
from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
phase = "02"
def create_ngram_model(params=None):
tfidf_ngrams = TfidfVectorizer(ngram_range=(1, 3),
analyzer="word", binary=False)
clf = MultinomialNB()
pipeline = Pipeline([('vect', tfidf_ngrams), ('clf', clf)])
if params:
pipeline.set_params(**params)
return pipeline
def grid_search_model(clf_factory, X, Y):
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
vect__min_df=[1, 2],
vect__stop_words=[None, "english"],
vect__smooth_idf=[False, True],
vect__use_idf=[False, True],
vect__sublinear_tf=[False, True],
vect__binary=[False, True],
clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
)
grid_search = GridSearchCV(clf_factory(),
param_grid=param_grid,
cv=cv,
score_func=f1_score,
verbose=10)
grid_search.fit(X, Y)
clf = grid_search.best_estimator_
print clf
return clf
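# A short sketch of the naming convention used in param_grid above: parameters
# of a Pipeline step are addressed as '<step name>__<parameter>', which also
# works with set_params / get_params on the pipeline itself.
_demo_pipe = create_ngram_model()
_demo_pipe.set_params(vect__ngram_range=(1, 2), clf__alpha=0.1)
print(_demo_pipe.get_params()['vect__ngram_range'])   # (1, 2)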
def train_model(clf, X, Y, name="NB ngram", plot=False):
# create it again for plotting
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
train_errors = []
test_errors = []
scores = []
pr_scores = []
precisions, recalls, thresholds = [], [], []
for train, test in cv:
X_train, y_train = X[train], Y[train]
X_test, y_test = X[test], Y[test]
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
train_errors.append(1 - train_score)
test_errors.append(1 - test_score)
scores.append(test_score)
proba = clf.predict_proba(X_test)
fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
precision, recall, pr_thresholds = precision_recall_curve(
y_test, proba[:, 1])
pr_scores.append(auc(recall, precision))
precisions.append(precision)
recalls.append(recall)
thresholds.append(pr_thresholds)
if plot:
scores_to_sort = pr_scores
median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
plot_pr(pr_scores[median], name, phase, precisions[median],
recalls[median], label=name)
summary = (np.mean(scores), np.std(scores),
np.mean(pr_scores), np.std(pr_scores))
print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary
return np.mean(train_errors), np.mean(test_errors)
def print_incorrect(clf, X, Y):
Y_hat = clf.predict(X)
wrong_idx = Y_hat != Y
X_wrong = X[wrong_idx]
Y_wrong = Y[wrong_idx]
Y_hat_wrong = Y_hat[wrong_idx]
for idx in xrange(len(X_wrong)):
print "clf.predict('%s')=%i instead of %i" %\
(X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])
def get_best_model():
best_params = dict(vect__ngram_range=(1, 2),
vect__min_df=1,
vect__stop_words=None,
vect__smooth_idf=False,
vect__use_idf=False,
vect__sublinear_tf=True,
vect__binary=False,
clf__alpha=0.01,
)
best_clf = create_ngram_model(best_params)
return best_clf
if __name__ == "__main__":
X_orig, Y_orig = load_sanders_data()
classes = np.unique(Y_orig)
for c in classes:
print "#%s: %i" % (c, sum(Y_orig == c))
print "== Pos vs. neg =="
pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
X = X_orig[pos_neg]
Y = Y_orig[pos_neg]
Y = tweak_labels(Y, ["positive"])
train_model(get_best_model(), X, Y, name="pos vs neg", plot=True)
print "== Pos/neg vs. irrelevant/neutral =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive", "negative"])
# best_clf = grid_search_model(create_ngram_model, X, Y, name="sent vs
# rest", plot=True)
train_model(get_best_model(), X, Y, name="sent vs rest", plot=True)
print "== Pos vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive"])
train_model(get_best_model(), X, Y, name="pos vs rest",
plot=True)
print "== Neg vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["negative"])
train_model(get_best_model(), X, Y, name="neg vs rest",
plot=True)
print "time spent:", time.time() - start_time
import time
start_time = time.time()
import re
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import ShuffleSplit
from sklearn.pipeline import Pipeline
from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels
from utils import log_false_positives
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from utils import load_sent_word_net
sent_word_net = load_sent_word_net()
phase = "03"
emo_repl = {
# positive emoticons
"<3": " good ",
":d": " good ", # :D in lower case
":dd": " good ", # :DD in lower case
"8)": " good ",
":-)": " good ",
":)": " good ",
";)": " good ",
"(-:": " good ",
"(:": " good ",
# negative emoticons:
":/": " bad ",
":>": " sad ",
":')": " sad ",
":-(": " bad ",
":(": " bad ",
":S": " bad ",
":-S": " bad ",
}
emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]
re_repl = {
r"\br\b": "are",
r"\bu\b": "you",
r"\bhaha\b": "ha",
r"\bhahaha\b": "ha",
r"\bdon't\b": "do not",
r"\bdoesn't\b": "does not",
r"\bdidn't\b": "did not",
r"\bhasn't\b": "has not",
r"\bhaven't\b": "have not",
r"\bhadn't\b": "had not",
r"\bwon't\b": "will not",
r"\bwouldn't\b": "would not",
r"\bcan't\b": "can not",
r"\bcannot\b": "can not",
}
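# A tiny, self-contained illustration (made-up text) of why emo_repl_order sorts
# the emoticon keys by decreasing length: a longer emoticon such as ":dd" must be
# replaced before its prefix ":d", or a trailing "d" would survive.
_demo_repl = {":d": " good ", ":dd": " good "}
_demo_order = [k for (_, k) in reversed(sorted([(len(k), k) for k in _demo_repl]))]
_s = "that movie was awesome :dd"
for _k in _demo_order:
    _s = _s.replace(_k, _demo_repl[_k])
print(_s)   # 'that movie was awesome  good '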
def create_ngram_model(params=None):
def preprocessor(tweet):
global emoticons_replaced
tweet = tweet.lower()
for k in emo_repl_order:
tweet = tweet.replace(k, emo_repl[k])
for r, repl in re_repl.iteritems():
tweet = re.sub(r, repl, tweet)
return tweet
tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
analyzer="word")
clf = MultinomialNB()
pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])
if params:
pipeline.set_params(**params)
return pipeline
def train_model(clf, X, Y, name="NB ngram", plot=False):
# create it again for plotting
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
train_errors = []
test_errors = []
scores = []
pr_scores = []
precisions, recalls, thresholds = [], [], []
clfs = [] # just to later get the median
for train, test in cv:
X_train, y_train = X[train], Y[train]
X_test, y_test = X[test], Y[test]
clf.fit(X_train, y_train)
clfs.append(clf)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
train_errors.append(1 - train_score)
test_errors.append(1 - test_score)
scores.append(test_score)
proba = clf.predict_proba(X_test)  # predict_proba returns class probabilities instead of hard labels
fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
precision, recall, pr_thresholds = precision_recall_curve(
y_test, proba[:, 1])
pr_scores.append(auc(recall, precision))
precisions.append(precision)
recalls.append(recall)
thresholds.append(pr_thresholds)
if plot:
scores_to_sort = pr_scores
median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
plot_pr(pr_scores[median], name, phase, precisions[median],
recalls[median], label=name)
log_false_positives(clfs[median], X_test, y_test, name)
summary = (np.mean(scores), np.std(scores),
np.mean(pr_scores), np.std(pr_scores))
print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary
return np.mean(train_errors), np.mean(test_errors)
def print_incorrect(clf, X, Y):
Y_hat = clf.predict(X)
wrong_idx = Y_hat != Y
X_wrong = X[wrong_idx]
Y_wrong = Y[wrong_idx]
Y_hat_wrong = Y_hat[wrong_idx]
for idx in xrange(len(X_wrong)):
print "clf.predict('%s')=%i instead of %i" %\
(X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])
def get_best_model():
best_params = dict(tfidf__ngram_range=(1, 2),
tfidf__min_df=1,
tfidf__stop_words=None,
tfidf__smooth_idf=False,
tfidf__use_idf=False,
tfidf__sublinear_tf=True,
tfidf__binary=False,
clf__alpha=0.01,
)
best_clf = create_ngram_model(best_params)
return best_clf
if __name__ == "__main__":
X_orig, Y_orig = load_sanders_data()
classes = np.unique(Y_orig)
for c in classes:
print "#%s: %i" % (c, sum(Y_orig == c))
print "== Pos vs. neg =="
pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
X = X_orig[pos_neg]
Y = Y_orig[pos_neg]
Y = tweak_labels(Y, ["positive"])
train_model(get_best_model(), X, Y, name="pos vs neg", plot=True)
print "== Pos/neg vs. irrelevant/neutral =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive", "negative"])
# best_clf = grid_search_model(create_union_model, X, Y, name="sent vs
# rest", plot=True)
train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True)
print "== Pos vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive"])
train_model(get_best_model(), X, Y, name="pos vs rest",
plot=True)
print "== Neg vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["negative"])
train_model(get_best_model(), X, Y, name="neg vs rest",
plot=True)
print "time spent:", time.time() - start_time
# This script tries to tweak hyperparameters to improve P/R AUC
#
import time
start_time = time.time()
import nltk
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import ShuffleSplit
from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels
from utils import log_false_positives
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion  # FeatureUnion: features computed in parallel and joined
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from utils import load_sent_word_net
sent_word_net = load_sent_word_net()
import json
poscache_filename = "poscache.json"
try:
poscache = json.load(open(poscache_filename, "r"))
except IOError:
poscache = {}
class StructCounter(BaseEstimator):
def get_feature_names(self):
return np.array(['sent_neut', 'sent_pos', 'sent_neg',
'nouns', 'adjectives', 'verbs', 'adverbs',
'allcaps', 'exclamation', 'question', 'hashtag', 'mentioning'])
def fit(self, documents, y=None):
return self
def _get_sentiments(self, d):
# http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
#import pdb;pdb.set_trace()
sent = tuple(d.split())
if poscache is not None:
if d in poscache:
tagged = poscache[d]
else:
poscache[d] = tagged = nltk.pos_tag(sent)
else:
tagged = nltk.pos_tag(sent)
pos_vals = []
neg_vals = []
nouns = 0.
adjectives = 0.
verbs = 0.
adverbs = 0.
for w,t in tagged:
p, n = 0,0
sent_pos_type = None
if t.startswith("NN"):
sent_pos_type = "n"
nouns += 1
elif t.startswith("JJ"):
sent_pos_type = "a"
adjectives += 1
elif t.startswith("VB"):
sent_pos_type = "v"
verbs += 1
elif t.startswith("RB"):
sent_pos_type = "r"
adverbs += 1
if sent_pos_type is not None:
sent_word = "%s/%s"%(sent_pos_type, w)
if sent_word in sent_word_net:
p,n = sent_word_net[sent_word]
pos_vals.append(p)
neg_vals.append(n)
l = len(sent)
avg_pos_val = np.mean(pos_vals)
avg_neg_val = np.mean(neg_vals)
return [1-avg_pos_val-avg_neg_val, avg_pos_val, avg_neg_val,
nouns/l, adjectives/l, verbs/l, adverbs/l]
def transform(self, documents):
obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T
allcaps = []
exclamation = []
question = []
hashtag = []
mentioning = []
for d in documents:
#import pdb;pdb.set_trace()
allcaps.append(np.sum([t.isupper() for t in d.split() if len(t)>2]))
exclamation.append(d.count("!"))
question.append(d.count("?"))
hashtag.append(d.count("#"))
mentioning.append(d.count("@"))
result = np.array([obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps,
exclamation, question, hashtag, mentioning]).T
return result
def create_union_model(params=None):
def preprocessor(tweet):
global emoticons_replaced
#return tweet.lower()
repl = {
# positive emoticons
"<3": " good ",
":D": " good ",
"8)": " good ",
":-)": " good ",
":)": " good ",
";)": " good ",
";-)": " good ",
# negative emoticons:
":/": " bad ",
":>": " sad ",
":-(": " bad ",
":(": " bad ",
":S": " bad ",
":-S": " bad ",
}
for a,b in repl.iteritems():
tweet = tweet.replace(a,b)
return tweet.lower()
tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
analyzer="word")
struct_stats = StructCounter()
all_features = FeatureUnion([('struct', struct_stats), ('tfidf', tfidf_ngrams)])  # FeatureUnion: the transformers run side by side and their outputs are concatenated (a simple column bind); a small sketch follows after this function
#all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
#all_features = FeatureUnion([('struct', struct_stats)])
clf = MultinomialNB()
pipeline = Pipeline([('all', all_features), ('clf', clf)])
if params:
pipeline.set_params(**params)
return pipeline
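# A minimal, self-contained sketch (toy transformer and texts, not from the book)
# of what FeatureUnion does inside create_union_model: the transformers see the
# same documents and their feature columns are concatenated side by side.
class _TokenCount(BaseEstimator):
    # hypothetical toy transformer: a single feature, the number of tokens
    def fit(self, documents, y=None):
        return self
    def transform(self, documents):
        return np.array([[len(d.split())] for d in documents], dtype=float)

_union = FeatureUnion([('count', _TokenCount()), ('tfidf', TfidfVectorizer())])
_feats = _union.fit_transform(np.asarray(["a short text", "a somewhat longer toy text"]))
print(_feats.shape)   # (2, 1 + size of the tf-idf vocabulary)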
def grid_search_model(clf_factory, X, Y):
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
param_grid = dict(all__tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
all__tfidf__min_df=[1, 2],
all__tfidf__stop_words=[None, "english"],
all__tfidf__smooth_idf=[False, True],
all__tfidf__use_idf=[False, True],
all__tfidf__sublinear_tf=[False, True],
all__tfidf__binary=[False, True],
clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
)
grid_search = GridSearchCV(clf_factory(),
param_grid=param_grid,
cv=cv,
score_func=f1_score,
verbose=10)
grid_search.fit(X, Y)
clf = grid_search.best_estimator_  # grid search returns the best model it found
print clf
return clf
def train_model(clf, X, Y, name="NB ngram", plot=False):
# create it again for plotting
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
train_errors = []
test_errors = []
scores = []
pr_scores = []
precisions, recalls, thresholds = [], [], []
clfs = [] # just to later get the median
for train, test in cv:
X_train, y_train = X[train], Y[train]
X_test, y_test = X[test], Y[test]
clf.fit(X_train, y_train)
clfs.append(clf)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
train_errors.append(1 - train_score)
test_errors.append(1 - test_score)
scores.append(test_score)
proba = clf.predict_proba(X_test)
fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
precision, recall, pr_thresholds = precision_recall_curve(
y_test, proba[:, 1])
pr_scores.append(auc(recall, precision))
precisions.append(precision)
recalls.append(recall)
thresholds.append(pr_thresholds)
if plot:
scores_to_sort = pr_scores
median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
plot_pr(pr_scores[median], name, "struct", precisions[median],
recalls[median], label=name)  # "struct" is an assumed phase tag for the chart filename
log_false_positives(clfs[median], X_test, y_test, name)
summary = (np.mean(scores), np.std(scores),
np.mean(pr_scores), np.std(pr_scores))
print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary
return np.mean(train_errors), np.mean(test_errors)
def print_incorrect(clf, X, Y):
Y_hat = clf.predict(X)
wrong_idx = Y_hat != Y
X_wrong = X[wrong_idx]
Y_wrong = Y[wrong_idx]
Y_hat_wrong = Y_hat[wrong_idx]
for idx in xrange(len(X_wrong)):
print "clf.predict('%s')=%i instead of %i" %\
(X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])
def get_best_model():
best_params = dict(all__tfidf__ngram_range=(1, 2),
all__tfidf__min_df=1,
all__tfidf__stop_words=None,
all__tfidf__smooth_idf=False,
all__tfidf__use_idf=False,
all__tfidf__sublinear_tf=True,
all__tfidf__binary=False,
clf__alpha=0.01,
)
best_clf = create_union_model(best_params)
return best_clf
if __name__ == "__main__":
X_orig, Y_orig = load_sanders_data()
#from sklearn.utils import shuffle
#print "shuffle, sample"
#X_orig, Y_orig = shuffle(X_orig, Y_orig)
#X_orig = X_orig[:100,]
#Y_orig = Y_orig[:100,]
classes = np.unique(Y_orig)
for c in classes:
print "#%s: %i" % (c, sum(Y_orig == c))
print "== Pos vs. neg =="
pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
X = X_orig[pos_neg]
Y = Y_orig[pos_neg]
Y = tweak_labels(Y, ["positive"])
grid_search_model(create_union_model, X, Y)
print "== Pos/neg vs. irrelevant/neutral =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive", "negative"])
# best_clf = grid_search_model(create_union_model, X, Y, name="sent vs
# rest", plot=True)
grid_search_model(create_union_model, X, Y)
print "== Pos vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive"])
grid_search_model(create_union_model, X, Y)
print "== Neg vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["negative"])
grid_search_model(create_union_model, X, Y)
print "time spent:", time.time() - start_time
json.dump(poscache, open(poscache_filename, "w"))
#
# This script tries to tweak hyperparameters to improve P/R AUC
#
import time
start_time = time.time()
import re
import nltk
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import ShuffleSplit
from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels
from utils import log_false_positives
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from utils import load_sent_word_net
sent_word_net = load_sent_word_net()
phase = "04"
import json
poscache_filename = "poscache.json"
try:  # try/except pattern
poscache = json.load(open(poscache_filename, "r"))
except IOError:
poscache = {}
class LinguisticVectorizer(BaseEstimator):
def get_feature_names(self):
return np.array(['sent_neut', 'sent_pos', 'sent_neg',
'nouns', 'adjectives', 'verbs', 'adverbs',
'allcaps', 'exclamation', 'question'])
def fit(self, documents, y=None):
return self
def _get_sentiments(self, d):
# http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
#import pdb;pdb.set_trace()
sent = tuple(nltk.word_tokenize(d))  # tokenize and freeze into a tuple
if poscache is not None:
if d in poscache:
tagged = poscache[d]
else:
poscache[d] = tagged = nltk.pos_tag(sent)
else:
tagged = nltk.pos_tag(sent)
pos_vals = []
neg_vals = []
nouns = 0.
adjectives = 0.
verbs = 0.
adverbs = 0.
for w,t in tagged:
p, n = 0,0
sent_pos_type = None
if t.startswith("NN"):
sent_pos_type = "n"
nouns += 1
elif t.startswith("JJ"):
sent_pos_type = "a"
adjectives += 1
elif t.startswith("VB"):
sent_pos_type = "v"
verbs += 1
elif t.startswith("RB"):
sent_pos_type = "r"
adverbs += 1
if sent_pos_type is not None:
sent_word = "%s/%s"%(sent_pos_type, w)
if sent_word in sent_word_net:
p,n = sent_word_net[sent_word]
pos_vals.append(p)
neg_vals.append(n)
l = len(sent)
avg_pos_val = np.mean(pos_vals)  # mean of the collected positive scores
avg_neg_val = np.mean(neg_vals)
#import pdb;pdb.set_trace()
return [1-avg_pos_val-avg_neg_val, avg_pos_val, avg_neg_val,
nouns/l, adjectives/l, verbs/l, adverbs/l]
def transform(self, documents):
obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T
allcaps = []
exclamation = []
question = []
for d in documents:
allcaps.append(np.sum([t.isupper() for t in d.split() if len(t)>2]))
exclamation.append(d.count("!"))
question.append(d.count("?"))
result = np.array([obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps,
exclamation, question]).T
return result
emo_repl = {
# positive emoticons
"<3": " good ",
":d": " good ", # :D in lower case
":dd": " good ", # :DD in lower case
"8)": " good ",
":-)": " good ",
":)": " good ",
";)": " good ",
"(-:": " good ",
"(:": " good ",
# negative emoticons:
":/": " bad ",
":>": " sad ",
":')": " sad ",
":-(": " bad ",
":(": " bad ",
":S": " bad ",
":-S": " bad ",
}
emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]
re_repl = {
r"\br\b": "are",
r"\bu\b": "you",
r"\bhaha\b": "ha",
r"\bhahaha\b": "ha",
r"\bdon't\b": "do not",
r"\bdoesn't\b": "does not",
r"\bdidn't\b": "did not",
r"\bhasn't\b": "has not",
r"\bhaven't\b": "have not",
r"\bhadn't\b": "had not",
r"\bwon't\b": "will not",
r"\bwouldn't\b": "would not",
r"\bcan't\b": "can not",
r"\bcannot\b": "can not",
}
def create_union_model(params=None):
def preprocessor(tweet):
tweet = tweet.lower()
for k in emo_repl_order:
tweet = tweet.replace(k, emo_repl[k])
for r, repl in re_repl.iteritems():
tweet = re.sub(r, repl, tweet)
return tweet.replace("-", " ").replace("_", " ")
tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
analyzer="word")
ling_stats = LinguisticVectorizer()
all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])
#all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
#all_features = FeatureUnion([('ling', ling_stats)])
clf = MultinomialNB()
pipeline = Pipeline([('all', all_features), ('clf', clf)])
if params:
pipeline.set_params(**params)
return pipeline
def __grid_search_model(clf_factory, X, Y):
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
vect__min_df=[1, 2],
vect__smooth_idf=[False, True],
vect__use_idf=[False, True],
vect__sublinear_tf=[False, True],
vect__binary=[False, True],
clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
)
grid_search = GridSearchCV(clf_factory(),
param_grid=param_grid,
cv=cv,
score_func=f1_score,
verbose=10)
grid_search.fit(X, Y)
clf = grid_search.best_estimator_
print clf
return clf
def train_model(clf, X, Y, name="NB ngram", plot=False):
# create it again for plotting
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
train_errors = []
test_errors = []
scores = []
pr_scores = []
precisions, recalls, thresholds = [], [], []
clfs = [] # just to later get the median
for train, test in cv:
X_train, y_train = X[train], Y[train]
X_test, y_test = X[test], Y[test]
clf.fit(X_train, y_train)
clfs.append(clf)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
train_errors.append(1 - train_score)
test_errors.append(1 - test_score)
scores.append(test_score)
proba = clf.predict_proba(X_test)
fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
precision, recall, pr_thresholds = precision_recall_curve(
y_test, proba[:, 1])
pr_scores.append(auc(recall, precision))
precisions.append(precision)
recalls.append(recall)
thresholds.append(pr_thresholds)
if plot:
scores_to_sort = pr_scores
median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
plot_pr(pr_scores[median], name, phase, precisions[median],
recalls[median], label=name)
log_false_positives(clfs[median], X_test, y_test, name)
summary = (np.mean(scores), np.std(scores),
np.mean(pr_scores), np.std(pr_scores))
print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary
return np.mean(train_errors), np.mean(test_errors)
def print_incorrect(clf, X, Y):
Y_hat = clf.predict(X)
wrong_idx = Y_hat != Y
X_wrong = X[wrong_idx]
Y_wrong = Y[wrong_idx]
Y_hat_wrong = Y_hat[wrong_idx]
for idx in xrange(len(X_wrong)):
print "clf.predict('%s')=%i instead of %i" %\
(X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])
def get_best_model():
best_params = dict(all__tfidf__ngram_range=(1, 2),
all__tfidf__min_df=1,
all__tfidf__stop_words=None,
all__tfidf__smooth_idf=False,
all__tfidf__use_idf=False,
all__tfidf__sublinear_tf=True,
all__tfidf__binary=False,
clf__alpha=0.01,
)
best_clf = create_union_model(best_params)
return best_clf
if __name__ == "__main__":
X_orig, Y_orig = load_sanders_data()
#from sklearn.utils import shuffle
#print "shuffle, sample"
#X_orig, Y_orig = shuffle(X_orig, Y_orig)
#X_orig = X_orig[:100,]
#Y_orig = Y_orig[:100,]
classes = np.unique(Y_orig)
for c in classes:
print "#%s: %i" % (c, sum(Y_orig == c))
print "== Pos vs. neg =="
pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
X = X_orig[pos_neg]
Y = Y_orig[pos_neg]
Y = tweak_labels(Y, ["positive"])
train_model(get_best_model(), X, Y, name="pos vs neg", plot=True)
print "== Pos/neg vs. irrelevant/neutral =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive", "negative"])
# best_clf = grid_search_model(create_union_model, X, Y, name="sent vs
# rest", plot=True)
train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True)
print "== Pos vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive"])
train_model(get_best_model(), X, Y, name="pos vs rest",
plot=True)
print "== Neg vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["negative"])
train_model(get_best_model(), X, Y, name="neg vs rest",
plot=True)
print "time spent:", time.time() - start_time
json.dump(poscache, open(poscache_filename, "w"))  # dump serializes the POS cache to JSON
#
# This script tries to tweak hyperparameters to improve P/R AUC
#
import time
start_time = time.time()
import re
import nltk
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import ShuffleSplit
from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels
from utils import log_false_positives
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from utils import load_sent_word_net
sent_word_net = load_sent_word_net()
import json
poscache_filename = "poscache.json"
try:
poscache = json.load(open(poscache_filename, "r"))
except IOError:
poscache = {}
class LinguisticVectorizer(BaseEstimator):
def get_feature_names(self):
return np.array(['sent_neut', 'sent_pos', 'sent_neg',
'nouns', 'adjectives', 'verbs', 'adverbs',
'allcaps', 'exclamation', 'question'])
def fit(self, documents, y=None):
return self
def _get_sentiments(self, d):
# http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
#import pdb;pdb.set_trace()
sent = tuple(nltk.word_tokenize(d))
if poscache is not None:
if d in poscache:
tagged = poscache[d]
else:
poscache[d] = tagged = nltk.pos_tag(sent)
else:
tagged = nltk.pos_tag(sent)
pos_vals = []
neg_vals = []
nouns = 0.
adjectives = 0.
verbs = 0.
adverbs = 0.
for w,t in tagged:
p, n = 0,0
sent_pos_type = None
if t.startswith("NN"):
sent_pos_type = "n"
nouns += 1
elif t.startswith("JJ"):
sent_pos_type = "a"
adjectives += 1
elif t.startswith("VB"):
sent_pos_type = "v"
verbs += 1
elif t.startswith("RB"):
sent_pos_type = "r"
adverbs += 1
if sent_pos_type is not None:
sent_word = "%s/%s"%(sent_pos_type, w)
if sent_word in sent_word_net:
p,n = sent_word_net[sent_word]
pos_vals.append(p)
neg_vals.append(n)
l = len(sent)
avg_pos_val = np.max(pos_vals)
avg_neg_val = np.max(neg_vals)
return [max(0,1-avg_pos_val-avg_neg_val), avg_pos_val, avg_neg_val,
nouns/l, adjectives/l, verbs/l, adverbs/l]
def transform(self, documents):
obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T
allcaps = []
exclamation = []
question = []
for d in documents:
allcaps.append(np.sum([t.isupper() for t in d.split() if len(t)>2]))
exclamation.append(d.count("!"))
question.append(d.count("?"))
result = np.array([obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps,
exclamation, question]).T
return result
emo_repl = {
# positive emoticons
"<3": " good ",
":d": " good ", # :D in lower case
":dd": " good ", # :DD in lower case
"8)": " good ",
":-)": " good ",
":)": " good ",
";)": " good ",
"(-:": " good ",
"(:": " good ",
# negative emoticons:
":/": " bad ",
":>": " sad ",
":')": " sad ",
":-(": " bad ",
":(": " bad ",
":S": " bad ",
":-S": " bad ",
}
emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]
re_repl = {
r"\br\b": "are",
r"\bu\b": "you",
r"\bhaha\b": "ha",
r"\bhahaha\b": "ha",
r"\bdon't\b": "do not",
r"\bdoesn't\b": "does not",
r"\bdidn't\b": "did not",
r"\bhasn't\b": "has not",
r"\bhaven't\b": "have not",
r"\bhadn't\b": "had not",
r"\bwon't\b": "will not",
r"\bwouldn't\b": "would not",
r"\bcan't\b": "can not",
r"\bcannot\b": "can not",
}
def create_union_model(params=None):
def preprocessor(tweet):
tweet = tweet.lower()
for k in emo_repl_order:
tweet = tweet.replace(k, emo_repl[k])
for r, repl in re_repl.iteritems():
tweet = re.sub(r, repl, tweet)
return tweet.replace("-", " ").replace("_", " ")
tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
analyzer="word")
ling_stats = LinguisticVectorizer()
all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])
#all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
#all_features = FeatureUnion([('ling', ling_stats)])
clf = MultinomialNB()
pipeline = Pipeline([('all', all_features), ('clf', clf)])
if params:
pipeline.set_params(**params)
return pipeline
def __grid_search_model(clf_factory, X, Y):
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
vect__min_df=[1, 2],
vect__smooth_idf=[False, True],
vect__use_idf=[False, True],
vect__sublinear_tf=[False, True],
vect__binary=[False, True],
clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
)
grid_search = GridSearchCV(clf_factory(),
param_grid=param_grid,
cv=cv,
score_func=f1_score,
verbose=10)
grid_search.fit(X, Y)
clf = grid_search.best_estimator_
print clf
return clf
def train_model(clf, X, Y, name="NB ngram", plot=False):
# create it again for plotting
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
train_errors = []
test_errors = []
scores = []
pr_scores = []
precisions, recalls, thresholds = [], [], []
clfs = [] # just to later get the median
for train, test in cv:
X_train, y_train = X[train], Y[train]
X_test, y_test = X[test], Y[test]
clf.fit(X_train, y_train)
clfs.append(clf)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
train_errors.append(1 - train_score)
test_errors.append(1 - test_score)
scores.append(test_score)
proba = clf.predict_proba(X_test)
fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
precision, recall, pr_thresholds = precision_recall_curve(
y_test, proba[:, 1])
pr_scores.append(auc(recall, precision))
precisions.append(precision)
recalls.append(recall)
thresholds.append(pr_thresholds)
if plot:
scores_to_sort = pr_scores
median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
plot_pr(pr_scores[median], name, "max_sent", precisions[median],
recalls[median], label=name)  # "max_sent" is an assumed phase tag for the chart filename
log_false_positives(clfs[median], X_test, y_test, name)
summary = (np.mean(scores), np.std(scores),
np.mean(pr_scores), np.std(pr_scores))
print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary
return np.mean(train_errors), np.mean(test_errors)
def print_incorrect(clf, X, Y):
Y_hat = clf.predict(X)
wrong_idx = Y_hat != Y
X_wrong = X[wrong_idx]
Y_wrong = Y[wrong_idx]
Y_hat_wrong = Y_hat[wrong_idx]
for idx in xrange(len(X_wrong)):
print "clf.predict('%s')=%i instead of %i" %\
(X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])
def get_best_model():
best_params = dict(all__tfidf__ngram_range=(1, 2),
all__tfidf__min_df=1,
all__tfidf__stop_words=None,
all__tfidf__smooth_idf=False,
all__tfidf__use_idf=False,
all__tfidf__sublinear_tf=True,
all__tfidf__binary=False,
clf__alpha=0.01,
)
best_clf = create_union_model(best_params)
return best_clf
if __name__ == "__main__":
X_orig, Y_orig = load_sanders_data()
#from sklearn.utils import shuffle
#print "shuffle, sample"
#X_orig, Y_orig = shuffle(X_orig, Y_orig)
#X_orig = X_orig[:100,]
#Y_orig = Y_orig[:100,]
classes = np.unique(Y_orig)
for c in classes:
print "#%s: %i" % (c, sum(Y_orig == c))
print "== Pos vs. neg =="
pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
X = X_orig[pos_neg]
Y = Y_orig[pos_neg]
Y = tweak_labels(Y, ["positive"])
train_model(get_best_model(), X, Y, name="pos vs neg", plot=True)
print "== Pos/neg vs. irrelevant/neutral =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive", "negative"])
# best_clf = grid_search_model(create_union_model, X, Y, name="sent vs
# rest", plot=True)
train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True)
print "== Pos vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive"])
train_model(get_best_model(), X, Y, name="pos vs rest",
plot=True)
print "== Neg vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["negative"])
train_model(get_best_model(), X, Y, name="neg vs rest",
plot=True)
print "time spent:", time.time() - start_time
json.dump(poscache, open(poscache_filename, "w"))
#
# This script tries to tweak hyperparameters to improve P/R AUC
#
import time
start_time = time.time()
import re
import nltk
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.cross_validation import ShuffleSplit
from utils import plot_pr
from utils import load_sanders_data
from utils import tweak_labels
from utils import log_false_positives
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator
from sklearn.naive_bayes import BernoulliNB
from utils import load_sent_word_net
sent_word_net = load_sent_word_net()
import json
poscache_filename = "poscache.json"
try:
poscache = json.load(open(poscache_filename, "r"))
except IOError:
poscache = {}
class LinguisticVectorizer(BaseEstimator):
def get_feature_names(self):
return np.array(['sent_neut', 'sent_pos', 'sent_neg',
'nouns', 'adjectives', 'verbs', 'adverbs',
'allcaps', 'exclamation', 'question'])
def fit(self, documents, y=None):
return self
def _get_sentiments(self, d):
# http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
#import pdb;pdb.set_trace()
sent = tuple(nltk.word_tokenize(d))
if poscache is not None:
if d in poscache:
tagged = poscache[d]
else:
poscache[d] = tagged = nltk.pos_tag(sent)
else:
tagged = nltk.pos_tag(sent)
pos_vals = []
neg_vals = []
nouns = 0.
adjectives = 0.
verbs = 0.
adverbs = 0.
for w,t in tagged:
p, n = 0,0
sent_pos_type = None
if t.startswith("NN"):
sent_pos_type = "n"
nouns += 1
elif t.startswith("JJ"):
sent_pos_type = "a"
adjectives += 1
elif t.startswith("VB"):
sent_pos_type = "v"
verbs += 1
elif t.startswith("RB"):
sent_pos_type = "r"
adverbs += 1
if sent_pos_type is not None:
sent_word = "%s/%s"%(sent_pos_type, w)
if sent_word in sent_word_net:
p,n = sent_word_net[sent_word]
pos_vals.append(p)
neg_vals.append(n)
l = len(sent)
avg_pos_val = np.mean(pos_vals)
avg_neg_val = np.mean(neg_vals)
#import pdb;pdb.set_trace()
return [1-avg_pos_val-avg_neg_val, avg_pos_val, avg_neg_val,
nouns/l, adjectives/l, verbs/l, adverbs/l]
def transform(self, documents):
obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array([self._get_sentiments(d) for d in documents]).T
allcaps = []
exclamation = []
question = []
for d in documents:
allcaps.append(np.sum([t.isupper() for t in d.split() if len(t)>2]))
exclamation.append(d.count("!"))
question.append(d.count("?"))
result = np.array([obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps,
exclamation, question]).T
return result
emo_repl = {
# positive emoticons
"<3": " good ",
":d": " good ", # :D in lower case
":dd": " good ", # :DD in lower case
"8)": " good ",
":-)": " good ",
":)": " good ",
";)": " good ",
"(-:": " good ",
"(:": " good ",
# negative emoticons:
":/": " bad ",
":>": " sad ",
":')": " sad ",
":-(": " bad ",
":(": " bad ",
":S": " bad ",
":-S": " bad ",
}
emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]
re_repl = {
r"\br\b": "are",
r"\bu\b": "you",
r"\bhaha\b": "ha",
r"\bhahaha\b": "ha",
r"\bdon't\b": "do not",
r"\bdoesn't\b": "does not",
r"\bdidn't\b": "did not",
r"\bhasn't\b": "has not",
r"\bhaven't\b": "have not",
r"\bhadn't\b": "had not",
r"\bwon't\b": "will not",
r"\bwouldn't\b": "would not",
r"\bcan't\b": "can not",
r"\bcannot\b": "can not",
}
def create_union_model(params=None):
def preprocessor(tweet):
tweet = tweet.lower()
for k in emo_repl_order:
tweet = tweet.replace(k, emo_repl[k])
for r, repl in re_repl.iteritems():
tweet = re.sub(r, repl, tweet)
return tweet.replace("-", " ").replace("_", " ")
count_ngrams = CountVectorizer(preprocessor=preprocessor,
analyzer="word")
ling_stats = LinguisticVectorizer()
all_features = FeatureUnion([('ling', ling_stats), ('count', count_ngrams)])
#all_features = FeatureUnion([('count', count_ngrams)])
#all_features = FeatureUnion([('ling', ling_stats)])
clf = BernoulliNB()
pipeline = Pipeline([('all', all_features), ('clf', clf)])
if params:
pipeline.set_params(**params)
return pipeline
def __grid_search_model(clf_factory, X, Y):
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
param_grid = dict(all__count__ngram_range=[(1, 1), (1, 2), (1, 3)],
all__count__min_df=[1, 2],
)
grid_search = GridSearchCV(clf_factory(),
param_grid=param_grid,
cv=cv,
score_func=f1_score,
verbose=10)
grid_search.fit(X, Y)
clf = grid_search.best_estimator_
print clf
return clf
def train_model(clf, X, Y, name="NB ngram", plot=False):
# create it again for plotting
cv = ShuffleSplit(
n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)
train_errors = []
test_errors = []
scores = []
pr_scores = []
precisions, recalls, thresholds = [], [], []
clfs = [] # just to later get the median
for train, test in cv:
X_train, y_train = X[train], Y[train]
X_test, y_test = X[test], Y[test]
clf.fit(X_train, y_train)
clfs.append(clf)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
train_errors.append(1 - train_score)
test_errors.append(1 - test_score)
scores.append(test_score)
proba = clf.predict_proba(X_test)
fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
precision, recall, pr_thresholds = precision_recall_curve(
y_test, proba[:, 1])
pr_scores.append(auc(recall, precision))
precisions.append(precision)
recalls.append(recall)
thresholds.append(pr_thresholds)
if plot:
scores_to_sort = pr_scores
median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
plot_pr(pr_scores[median], name, "counts", precisions[median],
recalls[median], label=name)  # "counts" is an assumed phase tag for the chart filename
log_false_positives(clfs[median], X_test, y_test, name)
summary = (np.mean(scores), np.std(scores),
np.mean(pr_scores), np.std(pr_scores))
print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary
return np.mean(train_errors), np.mean(test_errors)
def print_incorrect(clf, X, Y):
Y_hat = clf.predict(X)
wrong_idx = Y_hat != Y
X_wrong = X[wrong_idx]
Y_wrong = Y[wrong_idx]
Y_hat_wrong = Y_hat[wrong_idx]
for idx in xrange(len(X_wrong)):
print "clf.predict('%s')=%i instead of %i" %\
(X_wrong[idx], Y_hat_wrong[idx], Y_wrong[idx])
def get_best_model():
best_params = dict(all__count__ngram_range=(1, 2),
all__count__min_df=1,
all__count__stop_words=None,
all__count__binary=True,
)
best_clf = create_union_model(best_params)
return best_clf
if __name__ == "__main__":
X_orig, Y_orig = load_sanders_data()
#from sklearn.utils import shuffle
#print "shuffle, sample"
#X_orig, Y_orig = shuffle(X_orig, Y_orig)
#X_orig = X_orig[:100,]
#Y_orig = Y_orig[:100,]
classes = np.unique(Y_orig)
for c in classes:
print "#%s: %i" % (c, sum(Y_orig == c))
print "== Pos vs. neg =="
pos_neg = np.logical_or(Y_orig == "positive", Y_orig == "negative")
X = X_orig[pos_neg]
Y = Y_orig[pos_neg]
Y = tweak_labels(Y, ["positive"])
train_model(get_best_model(), X, Y, name="pos vs neg", plot=True)
print "== Pos/neg vs. irrelevant/neutral =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive", "negative"])
#best_clf = __grid_search_model(create_union_model, X, Y)
#print best_clf
train_model(get_best_model(), X, Y, name="pos+neg vs rest", plot=True)
print "== Pos vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["positive"])
train_model(get_best_model(), X, Y, name="pos vs rest",
plot=True)
print "== Neg vs. rest =="
X = X_orig
Y = tweak_labels(Y_orig, ["negative"])
train_model(get_best_model(), X, Y, name="neg vs rest",
plot=True)
print "time spent:", time.time() - start_time
json.dump(poscache, open(poscache_filename, "w"))
Chapter 7
import numpy as np
from sklearn.datasets import load_boston
import pylab as plt
boston = load_boston()
x = np.array([np.concatenate((v,[1])) for v in boston.data])  # np.concatenate joins arrays (cf. vstack/hstack); here it appends a bias term of 1 to every row
y = boston.target
s,total_error,_,_ = np.linalg.lstsq(x,y)
rmse = np.sqrt(total_error[0]/len(x))
print('Residual: {}'.format(rmse))  # note the empty {} placeholder; several can be used in sequence and are filled positionally
plt.plot(np.dot(x,s), boston.target,'ro')  # matrix multiplication with dot
plt.plot([0,50],[0,50], 'g-')
plt.xlabel('predicted')
plt.ylabel('real')
plt.show()
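# A small sketch of the design-matrix trick above: appending a constant 1 to
# each row lets lstsq fit an intercept; np.hstack with a column of ones builds
# the same matrix without a Python loop.
demo = np.array([[1.0, 2.0], [3.0, 4.0]])
print(np.hstack([demo, np.ones((demo.shape[0], 1))]))
# [[ 1.  2.  1.]
#  [ 3.  4.  1.]]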
from __future__ import print_function
from sklearn.cross_validation import KFold
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
import numpy as np
from sklearn.datasets import load_boston
boston = load_boston()  # reload the dataset so this snippet runs on its own
x = np.array([np.concatenate((v,[1])) for v in boston.data])
y = boston.target
for name,met in [
('elastic-net(.5)', ElasticNet(fit_intercept=True, alpha=0.5)),  # note: the estimators are constructed right inside the (name, model) list
('lasso(.5)', Lasso(fit_intercept=True, alpha=0.5)),
('ridge(.5)', Ridge(fit_intercept=True, alpha=0.5)),
]:
met.fit(x,y)
p = np.array([met.predict(xi) for xi in x])
e = p-y
total_error = np.dot(e,e)  # dot(e, e) is the sum of squared errors
rmse_train = np.sqrt(total_error/len(p))
kf = KFold(len(x), n_folds=10)
err = 0
for train,test in kf:
met.fit(x[train],y[train])  # cross-validation by indexing: train/test are index arrays
p = np.array([met.predict(xi) for xi in x[test]])
e = p-y[test]
err += np.dot(e,e)
rmse_10cv = np.sqrt(err/len(x))
print('Method: {}'.format(name))
print('RMSE on training: {}'.format(rmse_train))
print('RMSE on 10-fold CV: {}'.format(rmse_10cv))
print()
print()
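# A short, self-contained illustration (toy array) of the index-based
# cross-validation used above: KFold yields integer index arrays, so x[train]
# and y[test] are plain fancy-indexing slices; np.dot(e, e) is the sum of
# squared errors.
toy = np.arange(10)
for tr, te in KFold(len(toy), n_folds=5):
    print(toy[te])                  # each fold holds out two consecutive indices
err_demo = np.array([1.0, -2.0, 3.0])
print(np.dot(err_demo, err_demo))   # 14.0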
from sklearn.cross_validation import KFold
from sklearn.linear_model import LinearRegression, ElasticNet
import numpy as np
from sklearn.datasets import load_boston
boston = load_boston()
x = np.array([np.concatenate((v,[1])) for v in boston.data])
y = boston.target
FIT_EN = False
if FIT_EN:
model = ElasticNet(fit_intercept=True, alpha=0.5)  # the flag picks the estimator; the fit call below is shared
else:
model = LinearRegression(fit_intercept=True)
model.fit(x,y)
p = np.array([model.predict(xi) for xi in x])
e = p-y
total_error = np.dot(e,e)
rmse_train = np.sqrt(total_error/len(p))
kf = KFold(len(x), n_folds=10)  # the folds are index arrays
err = 0
for train,test in kf:
model.fit(x[train],y[train])
p = np.array([model.predict(xi) for xi in x[test]])
e = p-y[test]
err += np.dot(e,e)
rmse_10cv = np.sqrt(err/len(x))
print('RMSE on training: {}'.format(rmse_train))
print('RMSE on 10-fold CV: {}'.format(rmse_10cv))
import numpy as np
from sklearn.datasets import load_boston
import pylab as plt
from mpltools import style
style.use('ggplot')
boston = load_boston()
plt.scatter(boston.data[:,5], boston.target)
plt.xlabel("RM")
plt.ylabel("House Price")
x = boston.data[:,5]
x = np.array([[v] for v in x])
y = boston.target
slope,res,_,_ = np.linalg.lstsq(x,y)  # _ is the conventional throwaway name for return values we do not need
plt.plot([0,boston.data[:,5].max()+1],[0,slope*(boston.data[:,5].max()+1)], '-', lw=4)
plt.savefig('Figure1.png',dpi=150)
rmse = np.sqrt(res[0]/len(x))
print('Residual: {}'.format(rmse))
import numpy as np
from sklearn.datasets import load_boston
import pylab as plt
from mpltools import style  # usage: style.use(...)
style.use('ggplot')
boston = load_boston()
plt.scatter(boston.data[:,5], boston.target)
plt.xlabel("RM")
plt.ylabel("House Price")
x = boston.data[:,5]
xmin = x.min()
xmax = x.max()
x = np.array([[v,1] for v in x])
y = boston.target
(slope,bias),res,_,_ = np.linalg.lstsq(x,y)
plt.plot([xmin,xmax],[slope*xmin + bias, slope*xmax + bias], '-', lw=4)
plt.savefig('Figure2.png',dpi=150)
rmse = np.sqrt(res[0]/len(x))
print('Residual: {}'.format(rmse))
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import ElasticNet, LinearRegression
data,target = load_svmlight_file('E2006.train')
lr = LinearRegression(fit_intercept=True)
from sklearn.cross_validation import KFold
kf = KFold(len(target), n_folds=10)
err = 0
for train,test in kf:
lr.fit(data[train],target[train])
p = map(lr.predict, data[test])
p = np.array(p).ravel()  # ravel returns a flattened view where possible; flatten always copies
e = p-target[test]
err += np.dot(e,e)
rmse_10cv = np.sqrt(err/len(target))
lr.fit(data,target)
p = np.array(map(lr.predict, data))  # a list comprehension would work here as well
p = p.ravel()
e = p-target
total_error = np.dot(e,e)
rmse_train = np.sqrt(total_error/len(p))
print('RMSE on training: {}'.format(rmse_train))
print('RMSE on 10-fold CV: {}'.format(rmse_10cv))
import numpy as np
from scipy import sparse
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.cross_validation import KFold
data = np.array([[int(tok) for tok in line.split('\t')[:3]] for line in open('data/ml-100k/u.data')])  # nested list comprehension: keep only the first three tab-separated fields of each line
ij = data[:,:2]
ij -= 1 # original data is in 1-based system
values = data[:,2]
reviews = sparse.csc_matrix((values,ij.T)).astype(float)  # compressed sparse column storage (csr is the row-wise variant); astype casts the ratings to float; a small sketch of this construction follows after this script
reg = ElasticNetCV(fit_intercept=True, alphas=[0.0125, 0.025,0.05,.125,.25,.5,1.,2.,4.])
def movie_norm(xc):
xc = xc.copy().toarray()
x1 = np.array([xi[xi > 0].mean() for xi in xc])
x1 = np.nan_to_num(x1)  # np.nan_to_num turns nan (means of empty slices) into 0
for i in range(xc.shape[0]):
xc[i] -= (xc[i] > 0) * x1[i]
return xc, x1
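# A tiny illustration (toy row) of the case movie_norm guards against: a row
# with no positive entries gives a nan mean, and np.nan_to_num maps that nan to
# 0 so the centering step stays finite.
_empty_row = np.array([0.0, 0.0, 0.0])
_m = _empty_row[_empty_row > 0].mean()   # mean of an empty slice -> nan (numpy warns)
print(np.nan_to_num(_m))                 # 0.0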
def learn_for(i):
u = reviews[i]
us = np.delete(np.arange(reviews.shape[0]), i)
ps, = np.where(u.toarray().ravel() > 0)
x = reviews[us][:,ps].T
y = u.data
err = 0
eb = 0
kf = KFold(len(y), n_folds=4)
for train,test in kf:
xc,x1 = movie_norm(x[train])
reg.fit(xc, y[train]-x1)
xc,x1 = movie_norm(x[test])
p = np.array([reg.predict(xi) for xi in xc]).ravel()
e = (p+x1)-y[test]
err += np.sum(e*e)
eb += np.sum( (y[train].mean() - y[test])**2 )
return np.sqrt(err/float(len(y))), np.sqrt(eb/float(len(y)))
whole_data = []
for i in range(reviews.shape[0]):
s = learn_for(i)
print(s[0] < s[1])
print(s)
whole_data.append(s)
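# A minimal sketch (toy entries, not the MovieLens ratings) of the sparse
# construction used above: csc_matrix((values, (rows, cols))) places each value
# at its (row, col) coordinate and stores only the non-zero cells.
_rows = np.array([0, 0, 2])
_cols = np.array([1, 3, 2])
_vals = np.array([5, 3, 4])
_toy = sparse.csc_matrix((_vals, (_rows, _cols))).astype(float)
print(_toy.shape)        # (3, 4)
print(_toy.toarray())    # dense view, mostly zeros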
Chapter 8
def apriori(dataset, minsupport, maxsize):
'''
freqsets, baskets = apriori(dataset, minsupport, maxsize)
Parameters
----------
dataset : sequence of sequences
input dataset
minsupport : int
Minimal support for frequent items
maxsize : int
Maximal size of frequent items to return
Returns
-------
freqsets : sequence of sequences
baskets : dictionary
'''
from collections import defaultdict
baskets = defaultdict(list)
pointers = defaultdict(list)
for i,ds in enumerate(dataset):
for ell in ds:
pointers[ell].append(i)
baskets[frozenset([ell])].append(i)
pointers = dict([(k,frozenset(v)) for k,v in pointers.items()])
baskets = dict([(k,frozenset(v)) for k,v in baskets.items()])
valid = set(list(el)[0] for el,c in baskets.items() if (len(c) >= minsupport))
dataset = [[el for el in ds if (el in valid)] for ds in dataset]
dataset = [ds for ds in dataset if len(ds) > 1]
dataset = map(frozenset,dataset)
itemsets = [frozenset([v]) for v in valid]
freqsets = []
for i in range(maxsize-1):
print(len(itemsets))
newsets = []
for i,ell in enumerate(itemsets):
ccounts = baskets[ell]
for v_,pv in pointers.items():
if v_ not in ell:
csup = (ccounts & pv)
if len(csup) >= minsupport:
new = frozenset(ell|set([v_]))
if new not in baskets:
newsets.append(new)
baskets[new] = csup
freqsets.extend(itemsets)
itemsets = newsets
return freqsets,baskets
def association_rules(dataset, freqsets, baskets, minlift):
'''
for (antecendent, consequent, base, py_x, lift) in association_rules(dataset, freqsets, baskets, minlift):
...
This function takes the returns from ``apriori``.
Parameters
----------
dataset : sequence of sequences
input dataset
freqsets : sequence of sequences
baskets : dictionary
minlift : int
minimal lift of yielded rules
'''
nr_transactions = float(len(dataset))
freqsets = [f for f in freqsets if len(f) > 1]
for fset in freqsets:
for f in fset:
consequent = frozenset([f])
antecendent = fset-consequent
base = len(baskets[consequent])/nr_transactions
py_x = len(baskets[fset])/float(len(baskets[antecendent]))
lift = py_x / base
if lift > minlift:
yield (antecendent, consequent, base, py_x, lift) # note that this function is a generator: rules are produced lazily via yield (see the sketch below)
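# Minimal reminder of the generator mechanics used by association_rules above
# (a sketch): yield makes the function return a lazy iterator, so rules are
# produced one at a time as the consuming for-loop asks for them.
def count_up(n):
    for i in range(n):
        yield i
print(list(count_up(3)))  # [0, 1, 2]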
from apriori import apriori, association_rules
from gzip import GzipFile
dataset = [ [int(tok) for tok in line.strip().split()]
for line in GzipFile('retail.dat.gz')]
freqsets,baskets = apriori(dataset, 80, maxsize=5)
nr_transactions = float(len(dataset))
for ant,con,base,pyx,lift in association_rules(dataset, freqsets,baskets, 30):
print('{} | {} | {} ({:%}) | {} | {} | {}'
.format(ant, con, len(baskets[con]), len(baskets[con])/ nr_transactions, len(baskets[ant]), len(baskets[con|ant]), int(lift)))
import numpy as np
from collections import defaultdict
from itertools import chain # the itertools module provides iterator building blocks
from gzip import GzipFile
minsupport = 44
dataset = [ [int(tok) for tok in line.strip().split()]
for line in GzipFile('retail.dat.gz')]
dataset = dataset[::20]
counts = defaultdict(int)
for elem in chain(*dataset): # chain iterates lazily; the * unpacks the list of baskets into separate arguments (see the sketch after this script)
counts[elem] += 1
valid = set(el for el,c in counts.items() if (c >= minsupport))
dataset = [[el for el in ds if (el in valid)] for ds in dataset]
dataset = [frozenset(ds) for ds in dataset if len(ds) > 1]
itemsets = [frozenset([v]) for v in valid]
allsets = [itemsets]
for i in range(16):
print(len(itemsets))
nextsets = []
for i,ell in enumerate(itemsets):
for v_ in valid:
if v_ not in ell:
c = (ell|set([v_]))
if sum(1 for d in dataset if d.issuperset(c)) > minsupport:
nextsets.append(c)
allsets.append(nextsets)
itemsets = nextsets
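# Sketch of the chain(*dataset) idiom used above: the * unpacks the outer list
# so chain walks every element of every basket lazily, without building a flat copy.
from itertools import chain
toy_baskets = [[1, 2], [2, 3], [3]]
print(list(chain(*toy_baskets)))               # [1, 2, 2, 3, 3]
print(list(chain.from_iterable(toy_baskets)))  # same result, without unpacking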
import numpy as np
from collections import defaultdict
from itertools import chain
from gzip import GzipFile
dataset = [ [int(tok) for tok in line.strip().split()]
for line in GzipFile('retail.dat.gz')]
counts = defaultdict(int)
for elem in chain(*dataset):
counts[elem] += 1
counts = np.array(list(counts.values()))
bins = [1,2,4,8,16,32,64,128,512]
print(' {0:11} | {1:12}'.format('Nr of baskets', 'Nr of products'))
print('--------------------------------')
for i in range(len(bins)):
bot = bins[i]
top = (bins[i+1] if (i+1) < len(bins) else 100000000000)
print(' {0:4} - {1:3} | {2:12}'.format(bot, (top if top < 1000 else ''), np.sum( (counts >= bot) & (counts < top) )))
# note the conditional expression (a if cond else b) used inside .format above (see the sketch below)
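# Sketch of the conditional expression used inside the format call above:
# ``a if cond else b`` is an ordinary expression, so it can be passed directly
# as a format argument.
top = 512
print('{0:4} - {1:3}'.format(1, top if top < 1000 else ''))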
import numpy as np
# This is the version in the book:
def all_correlations(bait, target):
'''
corrs = all_correlations(bait, target)
corrs[i] is the correlation between bait and target[i]
'''
return np.array(
[np.corrcoef(bait, c)[0,1]
for c in target])
# This is a faster, but harder to read, implementation:
def all_correlations(y, X):
'''
Cs = all_correlations(y, X)
Cs[i] = np.corrcoef(y, X[i])[0,1]
'''
X = np.asanyarray(X, float) # asarray() always returns a plain ndarray; asanyarray() preserves subclasses such as np.matrix (see the sketch after this script)
y = np.asanyarray(y, float)
xy = np.dot(X, y)
y_ = y.mean()
ys_ = y.std()
x_ = X.mean(1)
xs_ = X.std(1)
n = float(len(y))
ys_ += 1e-5 # Handle zeros in ys
xs_ += 1e-5 # Handle zeros in x
return (xy - x_*y_*n)/n/xs_/ys_
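# Sketch of the asarray/asanyarray distinction noted above: asarray always
# returns a base ndarray, while asanyarray passes ndarray subclasses (e.g.
# np.matrix) through unchanged.
import numpy as np
m = np.matrix([[1., 2.], [3., 4.]])
print(type(np.asarray(m)))     # <class 'numpy.ndarray'>
print(type(np.asanyarray(m)))  # <class 'numpy.matrix'> -- subclass preserved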
from __future__ import print_function
from all_correlations import all_correlations
import numpy as np
from scipy import sparse
from load_ml100k import load
reviews = load()
def estimate_user(user, rest):
bu = user > 0
br = rest > 0
ws = all_correlations(bu,br)
selected = ws.argsort()[-100:]
estimates = rest[selected].mean(0)
estimates /= (.1+br[selected].mean(0))
return estimates
def train_test(user, rest):
estimates = estimate_user(user, rest)
bu = user > 0
br = rest > 0
err = estimates[bu]-user[bu]
null = rest.mean(0)
null /= (.1+br.mean(0))
nerr = null[bu]-user[bu]
return np.dot(err,err), np.dot(nerr, nerr)
def cross_validate_all():
err = []
for i in xrange(reviews.shape[0]):
err.append(
train_test(reviews[i], np.delete(reviews, i, 0))
)
revs = (reviews > 0).sum(1)
err = np.array(err)
rmse = np.sqrt(err / revs[:,None])
print(np.mean(rmse, 0))
print(np.mean(rmse[revs > 60], 0))
def all_estimates(reviews):
reviews = reviews.toarray()
estimates = np.zeros_like(reviews) # zeros_like/ones_like copy the shape and dtype of the template array, filled with 0s or 1s (see the sketch after this function)
for i in xrange(reviews.shape[0]):
estimates[i] = estimate_user(reviews[i], np.delete(reviews, i, 0))
return estimates
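# Sketch of zeros_like (and ones_like) as noted above: both copy the shape and
# dtype of the template array and fill the result with 0s or 1s.
import numpy as np
template = np.arange(6, dtype=float).reshape(2, 3)
print(np.zeros_like(template))  # 2x3 array of 0.0
print(np.ones_like(template))   # 2x3 array of 1.0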
from load_ml100k import load
from matplotlib import pyplot as plt
data = load()
data = data.toarray() # materialise the sparse matrix; toarray() returns an ndarray, todense() an np.matrix (see the sketch after this script)
plt.gray()
plt.imshow(data[:200,:200], interpolation='nearest') # imshow renders the ratings matrix as an image, shading cells by value
plt.xlabel('User ID')
plt.ylabel('Film ID')
plt.savefig('../1400_08_03+.png')
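# Sketch of the toarray/todense distinction noted above: both materialise the
# sparse data, but toarray returns a plain ndarray while todense returns an
# np.matrix.
import numpy as np
from scipy import sparse
s = sparse.csc_matrix(np.eye(3))
print(type(s.toarray()))  # <class 'numpy.ndarray'>
print(type(s.todense()))  # <class 'numpy.matrix'>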
import numpy as np
from scipy import sparse
def load():
data = np.array([[int(t) for t in line.split('\t')[:3]] for line in open('data/ml-100k/u.data')])
ij = data[:,:2]
ij -= 1 # original data is in 1-based system
values = data[:,2]
reviews = sparse.csc_matrix((values,ij.T)).astype(float) # CSC compresses by column, CSR by row (see the sketch after this function)
return reviews
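# Sketch of the CSC/CSR distinction noted above: CSC compresses along columns
# (fast column slicing), CSR along rows (fast row slicing); converting between
# the two is cheap.
import numpy as np
from scipy import sparse
vals = np.array([1., 2., 3.])
coords = (np.array([0, 1, 2]), np.array([2, 0, 1]))
csc = sparse.csc_matrix((vals, coords))
csr = csc.tocsr()
print(csc[:, 0].toarray().ravel())  # efficient column access on CSC
print(csr[0].toarray().ravel())     # efficient row access on CSR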
from __future__ import print_function
import numpy as np
from load_ml100k import load
from all_correlations import all_correlations
def nn_movie(ureviews, reviews, uid, mid, k=1):
X = ureviews
y = ureviews[mid].copy()
y -= y.mean()
y /= (y.std()+1e-5)
corrs = np.dot(X, y)
likes = corrs.argsort()
likes = likes[::-1]
c = 0
pred = 3.
for ell in likes:
if ell == mid:
continue
if reviews[uid,ell] > 0:
pred = reviews[uid,ell]
if c == k:
return pred
c += 1
return pred
def all_estimates(reviews, k=1):
reviews = reviews.astype(float)
k -= 1
nusers, nmovies = reviews.shape
estimates = np.zeros_like(reviews)
for u in range(nusers):
ureviews = np.delete(reviews, u, 0)
ureviews -= ureviews.mean(0)
ureviews /= (ureviews.std(0)+1e-4)
ureviews = ureviews.T.copy()
for m in np.where(reviews[u] > 0)[0]:
estimates[u,m] = nn_movie(ureviews, reviews, u, m, k)
return estimates
if __name__ == '__main__':
reviews = load().toarray()
estimates = all_estimates(reviews)
error = (estimates-reviews)
error **= 2 # ** is element-wise exponentiation; **= squares the array in place
error = error[reviews > 0]
print(np.sqrt(error).mean())
from __future__ import print_function
from sklearn.linear_model import LinearRegression
from load_ml100k import load
import numpy as np
import similar_movie
import usermodel
import corrneighbours
reviews = load()
reg = LinearRegression()
es = np.array([
usermodel.all_estimates(reviews),
corrneighbours.all_estimates(reviews),
similar_movie.all_estimates(reviews),
])
reviews = reviews.toarray()
total_error = 0.0
coefficients = []
for u in xrange(reviews.shape[0]):
es0 = np.delete(es,u,1)
r0 = np.delete(reviews, u, 0)
X,Y = np.where(r0 > 0)
X = es[:,X,Y]
y = r0[r0 > 0]
reg.fit(X.T,y)
coefficients.append(reg.coef_)
r0 = reviews[u]
X = np.where(r0 > 0)
p0 = reg.predict(es[:,u,X].squeeze().T)
err0 = r0[r0 > 0]-p0
total_error += np.dot(err0,err0)
print(u)
from sklearn.linear_model import LinearRegression
from load_ml100k import load
import numpy as np
import similar_movie
import usermodel
import corrneighbours
sreviews = load()
reviews = sreviews.toarray()
reg = LinearRegression()
es = np.array([
usermodel.all_estimates(sreviews),
similar_movie.all_estimates(reviews,k=1),
similar_movie.all_estimates(reviews,k=2),
similar_movie.all_estimates(reviews,k=3),
similar_movie.all_estimates(reviews,k=4),
similar_movie.all_estimates(reviews,k=5),
])
total_error = 0.0
coefficients = []
for u in xrange(reviews.shape[0]):
es0 = np.delete(es,u,1) # np.delete for leave-one-out: drop user u along axis 1 (see the sketch after this script)
r0 = np.delete(reviews, u, 0)
X,Y = np.where(r0 > 0)
X = es[:,X,Y]
y = r0[r0 > 0]
reg.fit(X.T,y)
coefficients.append(reg.coef_)
r0 = reviews[u]
X = np.where(r0 > 0)
p0 = reg.predict(es[:,u,X].squeeze().T)
err0 = r0[r0 > 0]-p0
total_error += np.dot(err0,err0)
coefficients = np.array(coefficients)
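# Sketch of the np.delete leave-one-out idiom noted above: np.delete(arr, u, axis)
# returns a copy with the u-th slice along that axis removed, leaving the
# original array untouched.
import numpy as np
ratings = np.arange(12).reshape(4, 3)
held_out = ratings[1]
rest = np.delete(ratings, 1, 0)  # every row except row 1
print(rest.shape)                # (3, 3)
print(held_out)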
import numpy as np
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.cross_validation import KFold
from load_ml100k import load
def learn_for(reviews, i):
reg = ElasticNetCV(fit_intercept=True, alphas=[0.0125, 0.025,0.05,.125,.25,.5,1.,2.,4.])
u = reviews[i]
us = range(reviews.shape[0])
del us[i]
ps, = np.where(u.toarray().ravel() > 0)
x = reviews[us][:,ps].T
y = u.data
kf = KFold(len(y), n_folds=4)
predictions = np.zeros(len(ps))
for train,test in kf:
xc = x[train].copy().toarray()
x1 = np.array([xi[xi > 0].mean() for xi in xc])
x1 = np.nan_to_num(x1)
for i in xrange(xc.shape[0]):
xc[i] -= (xc[i] > 0) * x1[i]
reg.fit(xc, y[train]-x1)
xc = x[test].copy().toarray()
x1 = np.array([xi[xi > 0].mean() for xi in xc])
Chapter 9
import os
from matplotlib import pylab
import numpy as np
DATA_DIR = os.path.join("..", "data")
CHART_DIR = os.path.join("..", "charts")
GENRE_DIR = "/media/sf_P/pymlbook-data/09-genre-class/genres"
GENRE_LIST = ["classical", "jazz", "country", "pop", "rock", "metal"]
def plot_confusion_matrix(cm, genre_list, name, title):
pylab.clf()
pylab.matshow(cm, fignum=False, cmap='Blues', vmin=0, vmax=1.0) # matshow draws the confusion matrix; cmap/vmin/vmax fix the colour scale
ax = pylab.axes()
ax.set_xticks(range(len(genre_list))) # note how the axes are set up: one tick per genre, labelled on both axes
ax.set_xticklabels(genre_list)
ax.xaxis.set_ticks_position("bottom")
ax.set_yticks(range(len(genre_list)))
ax.set_yticklabels(genre_list)
pylab.title(title)
pylab.colorbar()
pylab.grid(False)
pylab.show()
pylab.xlabel('Predicted class')
pylab.ylabel('True class')
pylab.grid(False)
pylab.savefig(os.path.join(CHART_DIR, "confusion_matrix_%s.png"%name), bbox_inches="tight")
def plot_pr(auc_score, name, precision, recall, label=None):
pylab.clf()
pylab.figure(num=None, figsize=(5, 4))
pylab.grid(True)
pylab.fill_between(recall, precision, alpha=0.5) # fill_between shades the area under the precision/recall curve
pylab.plot(recall, precision, lw=1)
pylab.xlim([0.0, 1.0])
pylab.ylim([0.0, 1.0])
pylab.xlabel('Recall')
pylab.ylabel('Precision')
pylab.title('P/R curve (AUC = %0.2f) / %s' % (auc_score, label))
filename = name.replace(" ", "_")
pylab.savefig(os.path.join(CHART_DIR, "pr_" + filename + ".png"), bbox_inches="tight")
def plot_roc(auc_score, name, tpr, fpr, label=None):
pylab.clf()
pylab.figure(num=None, figsize=(5, 4))
pylab.grid(True)
pylab.plot([0, 1], [0, 1], 'k--')
pylab.plot(fpr, tpr)
pylab.fill_between(fpr, tpr, alpha=0.5)
pylab.xlim([0.0, 1.0])
pylab.ylim([0.0, 1.0])
pylab.xlabel('False Positive Rate')
pylab.ylabel('True Positive Rate')
pylab.title('ROC curve (AUC = %0.2f) / %s' % (auc_score, label), verticalalignment="bottom")
pylab.legend(loc="lower right")
filename = name.replace(" ", "_")
pylab.savefig(os.path.join(CHART_DIR, "roc_" + filename + ".png"), bbox_inches="tight")
def show_most_informative_features(vectorizer, clf, n=20):
c_f = sorted(zip(clf.coef_[0], vectorizer.get_feature_names()))
top = zip(c_f[:n], c_f[:-(n + 1):-1])
for (c1, f1), (c2, f2) in top:
print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (c1, f1, c2, f2)
def plot_log():
pylab.clf()
x = np.arange(0.001, 1, 0.001)
y = np.log(x)
pylab.title('Relationship between probabilities and their logarithm')
pylab.plot(x, y)
pylab.grid(True)
pylab.xlabel('P')
pylab.ylabel('log(P)')
filename = 'log_probs.png'
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
def plot_feat_importance(feature_names, clf, name):
pylab.clf()
coef_ = clf.coef_
important = np.argsort(np.absolute(coef_.ravel()))
f_imp = feature_names[important]
coef = coef_.ravel()[important]
inds = np.argsort(coef)
f_imp = f_imp[inds]
coef = coef[inds]
xpos = np.array(range(len(coef)))
pylab.bar(xpos, coef, width=1)
pylab.title('Feature importance for %s' % (name))
ax = pylab.gca()
ax.set_xticks(np.arange(len(coef)))
labels = ax.set_xticklabels(f_imp)
for label in labels:
label.set_rotation(90)
filename = name.replace(" ", "_")
pylab.savefig(os.path.join(
CHART_DIR, "feat_imp_%s.png" % filename), bbox_inches="tight")
def plot_feat_hist(data_name_list, filename=None):
pylab.clf()
# import pdb;pdb.set_trace()
num_rows = 1 + (len(data_name_list) - 1) / 2
num_cols = 1 if len(data_name_list) == 1 else 2
pylab.figure(figsize=(5 * num_cols, 4 * num_rows))
for i in range(num_rows):
for j in range(num_cols):
pylab.subplot(num_rows, num_cols, 1 + i * num_cols + j) # subplot grid sized dynamically from the number of features
x, name = data_name_list[i * num_cols + j]
pylab.title(name)
pylab.xlabel('Value')
pylab.ylabel('Density')
# the histogram of the data
max_val = np.max(x)
if max_val <= 1.0:
bins = 50
elif max_val > 50:
bins = 50
else:
bins = max_val
n, bins, patches = pylab.hist(
x, bins=bins, normed=1, facecolor='green', alpha=0.75)
pylab.grid(True)
if not filename:
filename = "feat_hist_%s.png" % name
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
def plot_bias_variance(data_sizes, train_errors, test_errors, name):
pylab.clf()
pylab.ylim([0.0, 1.0])
pylab.xlabel('Data set size')
pylab.ylabel('Error')
pylab.title("Bias-Variance for '%s'" % name)
pylab.plot(
data_sizes, train_errors, "-", data_sizes, test_errors, "--", lw=1)
pylab.legend(["train error", "test error"], loc="upper right")
pylab.grid(True)
pylab.savefig(os.path.join(CHART_DIR, "bv_" + name + ".png"))
import numpy as np
from collections import defaultdict
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.metrics import auc
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import confusion_matrix
from utils import plot_pr, plot_roc, plot_confusion_matrix, GENRE_LIST
from fft import read_fft
TEST_DIR = "/media/sf_P/pymlbook-data/09-genre-class/private"
genre_list = GENRE_LIST
def train_model(clf_factory, X, Y, name, plot=False):
labels = np.unique(Y) # np.unique returns the sorted distinct class labels (see the sketch after this script)
cv = ShuffleSplit(
n=len(X), n_iter=1, test_size=0.3, indices=True, random_state=0)
train_errors = []
test_errors = []
scores = []
pr_scores = defaultdict(list)
precisions, recalls, thresholds = defaultdict(
list), defaultdict(list), defaultdict(list)
roc_scores = defaultdict(list)
tprs = defaultdict(list)
fprs = defaultdict(list)
clfs = [] # just to later get the median
cms = []
for train, test in cv:
X_train, y_train = X[train], Y[train]
X_test, y_test = X[test], Y[test]
clf = clf_factory()
clf.fit(X_train, y_train)
clfs.append(clf)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
scores.append(test_score)
train_errors.append(1 - train_score)
test_errors.append(1 - test_score)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cms.append(cm)
for label in labels:
y_label_test = np.asarray(y_test == label, dtype=int) # cast the boolean one-vs-rest mask to ints via the dtype argument
proba = clf.predict_proba(X_test)
proba_label = proba[:, label]
precision, recall, pr_thresholds = precision_recall_curve(
y_label_test, proba_label)
pr_scores[label].append(auc(recall, precision))
precisions[label].append(precision)
recalls[label].append(recall)
thresholds[label].append(pr_thresholds)
fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
roc_scores[label].append(auc(fpr, tpr))
tprs[label].append(tpr)
fprs[label].append(fpr)
if plot:
for label in labels:
print "Plotting", genre_list[label]
scores_to_sort = roc_scores[label]
median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
desc = "%s %s" % (name, genre_list[label])
plot_pr(pr_scores[label][median], desc, precisions[label][median],
recalls[label][median], label='%s vs rest' % genre_list[label])
plot_roc(roc_scores[label][median], desc, tprs[label][median],
fprs[label][median], label='%s vs rest' % genre_list[label])
all_pr_scores = np.asarray(pr_scores.values()).flatten()
summary = (np.mean(scores), np.std(scores),
np.mean(all_pr_scores), np.std(all_pr_scores))
print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary
return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def create_model():
from sklearn.linear_model.logistic import LogisticRegression
clf = LogisticRegression()
return clf
if __name__ == "__main__":
X, y = read_fft(genre_list)
train_avg, test_avg, cms = train_model(
create_model, X, y, "Log Reg FFT", plot=True)
cm_avg = np.mean(cms, axis=0)
cm_norm = cm_avg / np.sum(cm_avg, axis=0)
print cm_norm
plot_confusion_matrix(cm_norm, genre_list, "fft",
"Confusion matrix of an FFT based classifier")
import numpy as np
from collections import defaultdict
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.metrics import auc
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import confusion_matrix
from utils import plot_roc, plot_confusion_matrix, GENRE_LIST
from ceps import read_ceps
TEST_DIR = "/media/sf_P/pymlbook-data/09-genre-class/private"
genre_list = GENRE_LIST
def train_model(clf_factory, X, Y, name, plot=False):
labels = np.unique(Y)
cv = ShuffleSplit(
n=len(X), n_iter=1, test_size=0.3, indices=True, random_state=0)
train_errors = []
test_errors = []
scores = []
pr_scores = defaultdict(list)
precisions, recalls, thresholds = defaultdict(
list), defaultdict(list), defaultdict(list)
roc_scores = defaultdict(list)
tprs = defaultdict(list)
fprs = defaultdict(list)
clfs = [] # just to later get the median
cms = []
for train, test in cv:
X_train, y_train = X[train], Y[train]
X_test, y_test = X[test], Y[test]
clf = clf_factory()
clf.fit(X_train, y_train)
clfs.append(clf)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
scores.append(test_score)
train_errors.append(1 - train_score)
test_errors.append(1 - test_score)
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cms.append(cm)
for label in labels:
y_label_test = np.asarray(y_test == label, dtype=int)
proba = clf.predict_proba(X_test)
proba_label = proba[:, label]
precision, recall, pr_thresholds = precision_recall_curve(
y_label_test, proba_label)
pr_scores[label].append(auc(recall, precision))
precisions[label].append(precision)
recalls[label].append(recall)
thresholds[label].append(pr_thresholds)
fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
roc_scores[label].append(auc(fpr, tpr))
tprs[label].append(tpr)
fprs[label].append(fpr)
if plot:
for label in labels:
print "Plotting", genre_list[label]
scores_to_sort = roc_scores[label]
median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
desc = "%s %s" % (name, genre_list[label])
plot_roc(roc_scores[label][median], desc, tprs[label][median],
fprs[label][median], label='%s vs rest' % genre_list[label])
all_pr_scores = np.asarray(pr_scores.values()).flatten()
summary = (np.mean(scores), np.std(scores),
np.mean(all_pr_scores), np.std(all_pr_scores))
print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary
return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def create_model():
from sklearn.linear_model.logistic import LogisticRegression
clf = LogisticRegression()
return clf
if __name__ == "__main__":
X, y = read_ceps(genre_list)
train_avg, test_avg, cms = train_model(
create_model, X, y, "Log Reg CEPS", plot=True)
cm_avg = np.mean(cms, axis=0)
cm_norm = cm_avg / np.sum(cm_avg, axis=0)
print cm_norm
plot_confusion_matrix(cm_norm, genre_list, "ceps",
"Confusion matrix of a CEPS based classifier")
import os
import glob
import sys
import numpy as np
import scipy
import scipy.io.wavfile # scipy.io.wavfile reads WAV files into NumPy arrays
from scikits.talkbox.features import mfcc # scikits.talkbox provides the MFCC (mel-frequency cepstral coefficients) transform
from utils import GENRE_DIR
def write_ceps(ceps, fn):
"""
Write the MFCC to separate files to speed up processing.
"""
base_fn, ext = os.path.splitext(fn)
data_fn = base_fn + ".ceps"
np.save(data_fn, ceps)
print "Written", data_fn
def create_ceps(fn):
sample_rate, X = scipy.io.wavfile.read(fn)
ceps, mspec, spec = mfcc(X)
write_ceps(ceps, fn)
def read_ceps(genre_list, base_dir=GENRE_DIR):
X = []
y = []
for label, genre in enumerate(genre_list):
for fn in glob.glob(os.path.join(base_dir, genre, "*.ceps.npy")):
ceps = np.load(fn)
num_ceps = len(ceps)
X.append(
np.mean(ceps[int(num_ceps / 10):int(num_ceps * 9 / 10)], axis=0))
y.append(label)
return np.array(X), np.array(y)
if __name__ == "__main__":
os.chdir(GENRE_DIR)
glob_wav = os.path.join(sys.argv[1], "*.wav")
print glob_wav
for fn in glob.glob(glob_wav):
create_ceps(fn)
import sys
import os
import glob
import numpy as np
import scipy
import scipy.io.wavfile
from utils import GENRE_DIR, CHART_DIR
import matplotlib.pyplot as plt
from matplotlib.ticker import EngFormatter
def write_fft(fft_features, fn):
"""
Write the FFT features to separate files to speed up processing.
"""
base_fn, ext = os.path.splitext(fn)
data_fn = base_fn + ".fft"
np.save(data_fn, fft_features)
print "Written", data_fn
def create_fft(fn):
sample_rate, X = scipy.io.wavfile.read(fn)
fft_features = abs(scipy.fft(X)[:1000])
write_fft(fft_features, fn)
def read_fft(genre_list, base_dir=GENRE_DIR):
X = []
y = []
for label, genre in enumerate(genre_list):
genre_dir = os.path.join(base_dir, genre, "*.fft.npy")
file_list = glob.glob(genre_dir)
assert(file_list), genre_dir
for fn in file_list:
fft_features = np.load(fn)
X.append(fft_features[:2000])
y.append(label)
return np.array(X), np.array(y)
def plot_wav_fft(wav_filename, desc=None):
plt.clf()
plt.figure(num=None, figsize=(6, 4))
sample_rate, X = scipy.io.wavfile.read(wav_filename)
spectrum = np.fft.fft(X)
freq = np.fft.fftfreq(len(X), 1.0 / sample_rate)
plt.subplot(211)
num_samples = 200.0
plt.xlim(0, num_samples / sample_rate)
plt.xlabel("time [s]")
plt.title(desc or wav_filename)
plt.plot(np.arange(num_samples) / sample_rate, X[:num_samples])
plt.grid(True)
plt.subplot(212)
plt.xlim(0, 5000)
plt.xlabel("frequency [Hz]")
plt.xticks(np.arange(5) * 1000)
if desc:
desc = desc.strip()
fft_desc = desc[0].lower() + desc[1:]
else:
fft_desc = wav_filename
plt.title("FFT of %s" % fft_desc)
plt.plot(freq, abs(spectrum), linewidth=5)
plt.grid(True)
plt.tight_layout()
rel_filename = os.path.split(wav_filename)[1]
plt.savefig("%s_wav_fft.png" % os.path.splitext(rel_filename)[0],
bbox_inches='tight')
plt.show()
def plot_wav_fft_demo():
plot_wav_fft("sine_a.wav", "400Hz sine wave")
plot_wav_fft("sine_b.wav", "3,000Hz sine wave")
plot_wav_fft("sine_mix.wav", "Mixed sine wave")
def plot_specgram(ax, fn):
sample_rate, X = scipy.io.wavfile.read(fn)
ax.specgram(X, Fs=sample_rate, xextent=(0, 30))
def plot_specgrams(base_dir=CHART_DIR):
"""
Plot a bunch of spectrograms of wav files in different genres
"""
plt.clf()
genres = ["classical", "jazz", "country", "pop", "rock", "metal"]
num_files = 3
f, axes = plt.subplots(len(genres), num_files)
for genre_idx, genre in enumerate(genres):
for idx, fn in enumerate(glob.glob(os.path.join(GENRE_DIR, genre, "*.wav"))):
if idx == num_files:
break
axis = axes[genre_idx, idx]
axis.yaxis.set_major_formatter(EngFormatter())
axis.set_title("%s song %i" % (genre, idx + 1))
plot_specgram(axis, fn) # matplotlib's specgram computes and draws the spectrogram directly
specgram_file = os.path.join(base_dir, "Spectrogram_Genres.png")
plt.savefig(specgram_file, bbox_inches="tight")
plt.show()
if __name__ == "__main__":
# for fn in glob.glob(os.path.join(sys.argv[1], "*.wav")):
# create_fft(fn)
# plot_decomp()
if len(sys.argv) > 1:
plot_wav_fft(sys.argv[1], desc="some sample song")
else:
plot_wav_fft_demo()
plot_specgrams()
Chapter 10: Image Processing
import numpy as np
import mahotas as mh # mahotas handles image reading and processing
def edginess_sobel(image):
'''
edgi = edginess_sobel(image)
Measure the "edginess" of an image
'''
edges = mh.sobel(image, just_filter=True)
edges = edges.ravel()
return np.sqrt(np.dot(edges, edges))
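# Hypothetical usage sketch of edginess_sobel defined above (the file name is
# borrowed from the simple-dataset examples used elsewhere in this chapter):
# the returned scalar grows with the amount of edge energy in the image.
import mahotas as mh
scene = mh.imread('simple-dataset/scene00.jpg', as_grey=True)
print(edginess_sobel(scene))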
import numpy as np
import mahotas as mh
text = mh.imread("simple-dataset/text21.jpg")
scene = mh.imread("simple-dataset/scene00.jpg")
h,w,_ = text.shape
canvas = np.zeros((h,2*w+128,3), np.uint8)
canvas[:,-w:] = scene
canvas[:,:w] = text
canvas = canvas[::4,::4]
mh.imsave('../1400OS_10_10+.jpg', canvas)
import mahotas as mh
from mahotas.colors import rgb2grey
import numpy as np
im = mh.imread('lenna.jpg')
im = rgb2grey(im) # convert to greyscale
salt = np.random.random(im.shape) > .975
pepper = np.random.random(im.shape) > .975
im = np.maximum(salt*170, mh.stretch(im))
im = np.minimum(pepper*30 + im*(~pepper), im)
mh.imsave('../1400OS_10_13+.jpg', im.astype(np.uint8))
import mahotas as mh
from sklearn import cross_validation
from sklearn.linear_model.logistic import LogisticRegression
from mpltools import style
from matplotlib import pyplot as plt
import numpy as np
from glob import glob
basedir = 'AnimTransDistr'
def features_for(images):
fs = []
for im in images:
im = mh.imread(im,as_grey=True).astype(np.uint8)
fs.append(mh.features.haralick(im).mean(0))
return np.array(fs)
def features_labels(groups):
labels = np.zeros(sum(map(len,groups)))
st = 0
for i,g in enumerate(groups):
labels[st:st+len(g)] = i
st += len(g)
return np.vstack(groups), labels
classes = [
'Anims',
'Cars',
'Distras',
'Trans',
]
features = []
labels = []
for ci,cl in enumerate(classes):
images = glob('{}/{}/*.jpg'.format(basedir,cl))
features.extend(features_for(images))
labels.extend([ci for _ in images])
features = np.array(features)
labels = np.array(labels)
scores0 = cross_validation.cross_val_score(LogisticRegression(), features, labels, cv=10)
print('Accuracy (10 fold x-val) with Logistic Regression [std features]: %s%%' % (0.1* round(1000*scores0.mean())))
tfeatures = features
from sklearn.cluster import KMeans
from mahotas.features import surf
images = []
labels = []
for ci,cl in enumerate(classes):
curimages = glob('{}/{}/*.jpg'.format(basedir,cl))
images.extend(curimages)
labels.extend([ci for _ in curimages])
labels = np.array(labels)
alldescriptors = []
for im in images:
im = mh.imread(im, as_grey=1)
im = im.astype(np.uint8)
#alldescriptors.append(surf.dense(im, spacing=max(im.shape)//32))
alldescriptors.append(surf.surf(im, descriptor_only=True))
print('Descriptors done')
k = 256
km = KMeans(k)
concatenated = np.concatenate(alldescriptors)
concatenated = concatenated[::64]
print('k-meaning...')
km.fit(concatenated)
features = []
for d in alldescriptors:
c = km.predict(d)
features.append(
np.array([np.sum(c == i) for i in xrange(k)])
)
features = np.array(features)
print('predicting...')
scoreSURFlr = cross_validation.cross_val_score(LogisticRegression(), features, labels, cv=5).mean()
print('Accuracy (5 fold x-val) with Log. Reg [SURF features]: %s%%' % (0.1* round(1000*scoreSURFlr.mean())))
print('combined...')
allfeatures = np.hstack([features, tfeatures])
scoreSURFplr = cross_validation.cross_val_score(LogisticRegression(), allfeatures, labels, cv=5).mean()
print('Accuracy (5 fold x-val) with Log. Reg [All features]: %s%%' % (0.1* round(1000*scoreSURFplr.mean())))
style.use('ggplot')
plt.plot([0,1,2],100*np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), 'k-', lw=8)
plt.plot([0,1,2],100*np.array([scores0.mean(), scoreSURFlr, scoreSURFplr]), 'o', mec='#cccccc', mew=12, mfc='white')
plt.xlim(-.5,2.5)
plt.ylim(scores0.mean()*90., scoreSURFplr*110)
plt.xticks([0,1,2], ["baseline", "SURF", "combined"])
plt.ylabel('Accuracy (%)')
plt.savefig('../1400OS_10_18+.png')
from matplotlib import pyplot as plt
import numpy as np
import mahotas as mh
image = mh.imread('../1400OS_10_01.jpeg')
image = mh.colors.rgb2gray(image)
im8 = mh.gaussian_filter(image,8)
im16 = mh.gaussian_filter(image,16)
im32 = mh.gaussian_filter(image,32)
h,w = im8.shape
canvas = np.ones((h,3*w+256), np.uint8)
canvas *= 255
canvas[:,:w] = im8
canvas[:,w+128:2*w+128] = im16
canvas[:,-w:] = im32
mh.imsave('../1400OS_10_05+.jpg', canvas[:,::2])
im32 = mh.stretch(im32)
ot32 = mh.otsu(im32)
mh.imsave('../1400OS_10_06+.jpg', (im32 > ot32).astype(np.uint8)*255)
from matplotlib import pyplot as plt
import numpy as np
import mahotas as mh
image = mh.imread('../1400OS_10_01.jpeg')
image = mh.colors.rgb2gray(image, dtype=np.uint8)
image = image[::4,::4]
thresh = mh.sobel(image)
filtered = mh.sobel(image, just_filter=True)
thresh = mh.dilate(thresh,np.ones((7,7)))
filtered = mh.dilate(mh.stretch(filtered),np.ones((7,7)))
h,w = thresh.shape
canvas = 255*np.ones((h,w*2+64), np.uint8)
canvas[:,:w] = thresh*255
canvas[:,-w:] = filtered
mh.imsave('../1400OS_10_09+.jpg', canvas)
import mahotas as mh
import numpy as np
im = mh.imread('lenna.jpg')
r,g,b = im.transpose(2,0,1)
h,w = r.shape
r12 = mh.gaussian_filter(r, 12.)
g12 = mh.gaussian_filter(g, 12.)
b12 = mh.gaussian_filter(b, 12.)
im12 = mh.as_rgb(r12,g12,b12)
X,Y = np.mgrid[:h,:w]
X = X-h/2.
Y = Y-w/2.
X /= X.max()
Y /= Y.max()
C = np.exp(-2.*(X**2+ Y**2))
C -= C.min()
C /= C.ptp()
C = C[:,:,None]
ring = mh.stretch(im*C + (1-C)*im12)
mh.imsave('lenna-ring.jpg', ring)
import mahotas as mh
from sklearn import cross_validation
from sklearn.linear_model.logistic import LogisticRegression
import numpy as np
from glob import glob
from edginess import edginess_sobel
basedir = 'simple-dataset'
def features_for(im):
im = mh.imread(im,as_grey=True).astype(np.uint8)
return mh.features.haralick(im).mean(0)
features = []
sobels = []
labels = []
images = glob('{}/*.jpg'.format(basedir))
for im in images:
features.append(features_for(im))
sobels.append(edginess_sobel(mh.imread(im, as_grey=True)))
labels.append(im[:-len('00.jpg')])
features = np.array(features)
labels = np.array(labels)
scores = cross_validation.cross_val_score(LogisticRegression(), features, labels, cv=5)
print('Accuracy (5 fold x-val) with Logistic Regression [std features]: {}%'.format(0.1* round(1000*scores.mean())))
scores = cross_validation.cross_val_score(LogisticRegression(), np.hstack([np.atleast_2d(sobels).T,features]), labels, cv=5).mean()
print('Accuracy (5 fold x-val) with Logistic Regression [std features + sobel]: {}%'.format(0.1* round(1000*scores.mean())))
import numpy as np
import mahotas as mh
image = mh.imread('../1400OS_10_01.jpeg')
image = mh.colors.rgb2gray(image, dtype=np.uint8)
thresh = mh.thresholding.otsu(image)
print(thresh)
otsubin = (image > thresh)
mh.imsave('otsu-threshold.jpeg', otsubin.astype(np.uint8)*255)
otsubin = ~ mh.close(~otsubin, np.ones((15,15)))
mh.imsave('otsu-closed.jpeg', otsubin.astype(np.uint8)*255)
thresh = mh.thresholding.rc(image)
print(thresh)
mh.imsave('rc-threshold.jpeg', (image > thresh).astype(np.uint8)*255)
Chapter 11: Correlation
import os
from matplotlib import pylab
import numpy as np
import scipy
from scipy.stats import norm, pearsonr # pearsonr computes the Pearson correlation coefficient
DATA_DIR = os.path.join("..", "data")
CHART_DIR = os.path.join("..", "charts")
def _plot_correlation_func(x, y):
r, p = pearsonr(x, y)
title = "Cor($X_1$, $X_2$) = %.3f" % r
pylab.scatter(x, y)
pylab.title(title)
pylab.xlabel("$X_1$")
pylab.ylabel("$X_2$")
f1 = scipy.poly1d(scipy.polyfit(x, y, 1))
pylab.plot(x, f1(x), "r--", linewidth=2)
# pylab.xticks([w*7*24 for w in [0,1,2,3,4]], ['week %i'%(w+1) for w in
# [0,1,2,3,4]])
def plot_correlation_demo():
np.random.seed(0) # to reproduce the data later on
pylab.clf()
pylab.figure(num=None, figsize=(8, 8))
x = np.arange(0, 10, 0.2)
pylab.subplot(221)
y = 0.5 * x + norm.rvs(1, loc=0, scale=.01, size=len(x))
_plot_correlation_func(x, y)
pylab.subplot(222)
y = 0.5 * x + norm.rvs(1, loc=0, scale=.1, size=len(x))
_plot_correlation_func(x, y)
pylab.subplot(223)
y = 0.5 * x + norm.rvs(1, loc=0, scale=1, size=len(x))
_plot_correlation_func(x, y)
pylab.subplot(224)
y = norm.rvs(1, loc=0, scale=10, size=len(x))
_plot_correlation_func(x, y)
pylab.autoscale(tight=True)
pylab.grid(True)
filename = "corr_demo_1.png"
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
pylab.clf()
pylab.figure(num=None, figsize=(8, 8))
x = np.arange(-5, 5, 0.2)
pylab.subplot(221)
y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=.01, size=len(x))
_plot_correlation_func(x, y)
pylab.subplot(222)
y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=.1, size=len(x))
_plot_correlation_func(x, y)
pylab.subplot(223)
y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=1, size=len(x))
_plot_correlation_func(x, y)
pylab.subplot(224)
y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=10, size=len(x))
_plot_correlation_func(x, y)
pylab.autoscale(tight=True)
pylab.grid(True)
filename = "corr_demo_2.png"
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
if __name__ == '__main__':
plot_correlation_demo()
import os
from matplotlib import pylab
import numpy as np
from scipy.stats import norm, entropy # entropy is used below to normalise the mutual information
DATA_DIR = os.path.join("..", "data")
CHART_DIR = os.path.join("..", "charts")
def mutual_info(x, y, bins=10):
counts_xy, bins_x, bins_y = np.histogram2d(x, y, bins=(bins, bins))
counts_x, bins = np.histogram(x, bins=bins)
counts_y, bins = np.histogram(y, bins=bins)
counts_xy += 1
counts_x += 1
counts_y += 1
P_xy = counts_xy / np.sum(counts_xy, dtype=float)
P_x = counts_x / np.sum(counts_x, dtype=float)
P_y = counts_y / np.sum(counts_y, dtype=float)
I_xy = np.sum(P_xy * np.log2(P_xy / (P_x.reshape(-1, 1) * P_y)))
return I_xy / (entropy(counts_x) + entropy(counts_y))
def plot_entropy():
pylab.clf()
pylab.figure(num=None, figsize=(5, 4))
title = "Entropy $H(X)$"
pylab.title(title)
pylab.xlabel("$P(X=$coin will show heads up$)$")
pylab.ylabel("$H(X)$")
pylab.xlim(xmin=0, xmax=1.1)
x = np.arange(0.001, 1, 0.001)
y = -x * np.log2(x) - (1 - x) * np.log2(1 - x)
pylab.plot(x, y)
# pylab.xticks([w*7*24 for w in [0,1,2,3,4]], ['week %i'%(w+1) for w in
# [0,1,2,3,4]])
pylab.autoscale(tight=True)
pylab.grid(True)
filename = "entropy_demo.png"
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
def _plot_mi_func(x, y):
mi = mutual_info(x, y)
title = "NI($X_1$, $X_2$) = %.3f" % mi
pylab.scatter(x, y)
pylab.title(title)
pylab.xlabel("$X_1$")
pylab.ylabel("$X_2$")
def plot_mi_demo():
np.random.seed(0) # to reproduce the data later on
pylab.clf()
pylab.figure(num=None, figsize=(8, 8))
x = np.arange(0, 10, 0.2)
pylab.subplot(221)
y = 0.5 * x + norm.rvs(1, loc=0, scale=.01, size=len(x))
_plot_mi_func(x, y)
pylab.subplot(222)
y = 0.5 * x + norm.rvs(1, loc=0, scale=.1, size=len(x))
_plot_mi_func(x, y)
pylab.subplot(223)
y = 0.5 * x + norm.rvs(1, loc=0, scale=1, size=len(x))
_plot_mi_func(x, y)
pylab.subplot(224)
y = norm.rvs(1, loc=0, scale=10, size=len(x))
_plot_mi_func(x, y)
pylab.autoscale(tight=True)
pylab.grid(True)
filename = "mi_demo_1.png"
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
pylab.clf()
pylab.figure(num=None, figsize=(8, 8))
x = np.arange(-5, 5, 0.2)
pylab.subplot(221)
y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=.01, size=len(x))
_plot_mi_func(x, y)
pylab.subplot(222)
y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=.1, size=len(x))
_plot_mi_func(x, y)
pylab.subplot(223)
y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=1, size=len(x))
_plot_mi_func(x, y)
pylab.subplot(224)
y = 0.5 * x ** 2 + norm.rvs(1, loc=0, scale=10, size=len(x))
_plot_mi_func(x, y)
pylab.autoscale(tight=True)
pylab.grid(True)
filename = "mi_demo_2.png"
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
if __name__ == '__main__':
plot_entropy()
plot_mi_demo()
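# Hypothetical usage sketch of mutual_info defined above: the normalised score
# stays near zero for independent variables and grows with dependence, even
# when the relationship is non-linear.
import numpy as np
np.random.seed(0)
x_demo = np.random.uniform(-5, 5, 1000)
print(mutual_info(x_demo, 0.5 * x_demo ** 2))               # strong non-linear dependence
print(mutual_info(x_demo, np.random.uniform(-5, 5, 1000)))  # close to zero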
from sklearn.feature_selection import RFE # recursive feature elimination (RFE) for feature selection
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
X, y = make_classification(
n_samples=100, n_features=10, n_informative=3, random_state=0)
clf = LogisticRegression()
clf.fit(X, y)
for i in range(1, 11):
selector = RFE(clf, i)
selector = selector.fit(X, y)
print("%i\t%s\t%s" % (i, selector.support_, selector.ranking_))
import os
from matplotlib import pylab
import numpy as np
from sklearn import linear_model, decomposition # decomposition provides PCA
from sklearn import lda # linear discriminant analysis (LDA)
logistic = linear_model.LogisticRegression()
CHART_DIR = os.path.join("..", "charts")
np.random.seed(3)
x1 = np.arange(0, 10, .2)
x2 = x1 + np.random.normal(loc=0, scale=1, size=len(x1))
def plot_simple_demo_1():
pylab.clf()
fig = pylab.figure(num=None, figsize=(10, 4))
pylab.subplot(121)
title = "Original feature space"
pylab.title(title)
pylab.xlabel("$X_1$")
pylab.ylabel("$X_2$")
x1 = np.arange(0, 10, .2)
x2 = x1 + np.random.normal(loc=0, scale=1, size=len(x1))
good = (x1 > 5) | (x2 > 5)
bad = ~good
x1g = x1[good]
x2g = x2[good]
pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue")
x1b = x1[bad]
x2b = x2[bad]
pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white")
pylab.grid(True)
pylab.subplot(122)
X = np.c_[(x1, x2)]
pca = decomposition.PCA(n_components=1) #PCA
Xtrans = pca.fit_transform(X)
Xg = Xtrans[good]
Xb = Xtrans[bad]
pylab.scatter(
Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue")
pylab.scatter(
Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white")
title = "Transformed feature space"
pylab.title(title)
pylab.xlabel("$X'$")
fig.axes[1].get_yaxis().set_visible(False)
print(pca.explained_variance_ratio_)
pylab.grid(True)
pylab.autoscale(tight=True)
filename = "pca_demo_1.png"
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
def plot_simple_demo_2():
pylab.clf()
fig = pylab.figure(num=None, figsize=(10, 4))
pylab.subplot(121)
title = "Original feature space"
pylab.title(title)
pylab.xlabel("$X_1$")
pylab.ylabel("$X_2$")
x1 = np.arange(0, 10, .2)
x2 = x1 + np.random.normal(loc=0, scale=1, size=len(x1))
good = x1 > x2
bad = ~good
x1g = x1[good]
x2g = x2[good]
pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue")
x1b = x1[bad]
x2b = x2[bad]
pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white")
pylab.grid(True)
pylab.subplot(122)
X = np.c_[(x1, x2)]
pca = decomposition.PCA(n_components=1)
Xtrans = pca.fit_transform(X)
Xg = Xtrans[good]
Xb = Xtrans[bad]
pylab.scatter(
Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue")
pylab.scatter(
Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white")
title = "Transformed feature space"
pylab.title(title)
pylab.xlabel("$X'$")
fig.axes[1].get_yaxis().set_visible(False)
print(pca.explained_variance_ratio_)
pylab.grid(True)
pylab.autoscale(tight=True)
filename = "pca_demo_2.png"
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
def plot_simple_demo_lda():
pylab.clf()
fig = pylab.figure(num=None, figsize=(10, 4))
pylab.subplot(121)
title = "Original feature space"
pylab.title(title)
pylab.xlabel("$X_1$")
pylab.ylabel("$X_2$")
good = x1 > x2
bad = ~good
x1g = x1[good]
x2g = x2[good]
pylab.scatter(x1g, x2g, edgecolor="blue", facecolor="blue")
x1b = x1[bad]
x2b = x2[bad]
pylab.scatter(x1b, x2b, edgecolor="red", facecolor="white")
pylab.grid(True)
pylab.subplot(122)
X = np.c_[(x1, x2)]
lda_inst = lda.LDA(n_components=1) # LDA projection onto one discriminant component
Xtrans = lda_inst.fit_transform(X, good)
Xg = Xtrans[good]
Xb = Xtrans[bad]
pylab.scatter(
Xg[:, 0], np.zeros(len(Xg)), edgecolor="blue", facecolor="blue")
pylab.scatter(
Xb[:, 0], np.zeros(len(Xb)), edgecolor="red", facecolor="white")
title = "Transformed feature space"
pylab.title(title)
pylab.xlabel("$X'$")
fig.axes[1].get_yaxis().set_visible(False)
pylab.grid(True)
pylab.autoscale(tight=True)
filename = "lda_demo.png"
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
if __name__ == '__main__':
plot_simple_demo_1()
plot_simple_demo_2()
plot_simple_demo_lda()
import os
import numpy as np
from matplotlib import pylab
from mpl_toolkits.mplot3d import Axes3D
from sklearn import linear_model, manifold, decomposition, datasets # MDS lives in the manifold package
logistic = linear_model.LogisticRegression()
CHART_DIR = os.path.join("..", "charts")
np.random.seed(3)
# all examples will have three classes in this file
colors = ['r', 'g', 'b']
markers = ['o', 6, '*']
def plot_demo_1():
X = np.c_[np.ones(5), 2 * np.ones(5), 10 * np.ones(5)].T # np.c_ concatenates along the second axis (columns); related to but distinct from np.vstack
y = np.array([0, 1, 2])
fig = pylab.figure(figsize=(10, 4))
ax = fig.add_subplot(121, projection='3d')
ax.set_axis_bgcolor('white') #set_axis_bgcolor
mds = manifold.MDS(n_components=3) # MDS (multidimensional scaling)
Xtrans = mds.fit_transform(X)
for cl, color, marker in zip(np.unique(y), colors, markers):
ax.scatter(
Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black')
pylab.title("MDS on example data set in 3 dimensions")
ax.view_init(10, -15)
mds = manifold.MDS(n_components=2)
Xtrans = mds.fit_transform(X)
ax = fig.add_subplot(122)
for cl, color, marker in zip(np.unique(y), colors, markers):
ax.scatter(
Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black')
pylab.title("MDS on example data set in 2 dimensions")
filename = "mds_demo_1.png"
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
def plot_iris_mds():
iris = datasets.load_iris()
X = iris.data
y = iris.target
# MDS
fig = pylab.figure(figsize=(10, 4))
ax = fig.add_subplot(121, projection='3d')
ax.set_axis_bgcolor('white')
mds = manifold.MDS(n_components=3)
Xtrans = mds.fit_transform(X)
for cl, color, marker in zip(np.unique(y), colors, markers):
ax.scatter(
Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black')
pylab.title("MDS on Iris data set in 3 dimensions")
ax.view_init(10, -15)
mds = manifold.MDS(n_components=2)
Xtrans = mds.fit_transform(X)
ax = fig.add_subplot(122)
for cl, color, marker in zip(np.unique(y), colors, markers):
ax.scatter(
Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black')
pylab.title("MDS on Iris data set in 2 dimensions")
filename = "mds_demo_iris.png"
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
# PCA
fig = pylab.figure(figsize=(10, 4))
ax = fig.add_subplot(121, projection='3d')
ax.set_axis_bgcolor('white')
pca = decomposition.PCA(n_components=3)
Xtrans = pca.fit(X).transform(X)
for cl, color, marker in zip(np.unique(y), colors, markers):
ax.scatter(
Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], Xtrans[y == cl][:, 2], c=color, marker=marker, edgecolor='black')
pylab.title("PCA on Iris data set in 3 dimensions")
ax.view_init(50, -35)
pca = decomposition.PCA(n_components=2)
Xtrans = pca.fit_transform(X)
ax = fig.add_subplot(122)
for cl, color, marker in zip(np.unique(y), colors, markers):
ax.scatter(
Xtrans[y == cl][:, 0], Xtrans[y == cl][:, 1], c=color, marker=marker, edgecolor='black')
pylab.title("PCA on Iris data set in 2 dimensions")
filename = "pca_demo_iris.png"
pylab.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
if __name__ == '__main__':
plot_demo_1()
plot_iris_mds()
Chapter 12
from jug import TaskGenerator # jug distributes task execution across processes and machines
from time import sleep
@TaskGenerator
def double(x):
sleep(4)
return 2*x
@TaskGenerator
def add(a, b):
return a + b
@TaskGenerator
def print_final_result(oname, value):
with open(oname, 'w') as output:
print >>output, "Final result:", value
input = 2
y = double(input)
z = double(y)
y2 = double(7)
z2 = double(y2)
print_final_result('output.txt', add(z,z2))
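# This script is meant to be driven by the jug command-line tool rather than run
# with plain python; a typical session (a sketch) looks like:
#   jug execute jugfile.py   # run pending tasks (can be started on several machines at once)
#   jug status jugfile.py    # show which tasks are finished, running, or waiting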