加载示例数据集
from __future__ import print_function
from sklearn import datasets
print filter(lambda s: s.startswith('load_'), dir(datasets))
'''
['load_boston', 'load_breast_cancer', 'load_diabetes', 'load_digits', 'load_files', 'load_iris', 'load_lfw_pairs', 'load_lfw_people', 'load_linnerud', 'load_mlcomp', 'load_sample_image', 'load_sample_images', 'load_svmlight_file', 'load_svmlight_files']
'''
boston_prices = datasets.load_boston()
print("Data shape", boston_prices.data.shape)
print("Data max=%s min=%s" % (boston_prices.data.max(), boston_prices. data.min()))
print("Target shape", boston_prices.target.shape)
print("Target max=%s min=%s" % (boston_prices.target.max(), boston_ prices.target.min()))
道琼斯股票聚类
start = datetime.datetime(2011, 01, 01)
end = datetime.datetime(2012, 01, 01)
symbols = ["AA", "AXP", "BA", "BAC", "CAT",
"CSCO", "CVX", "DD", "DIS", "GE", "HD",
"HPQ", "IBM", "INTC", "JNJ", "JPM",
"KO", "MCD", "MMM", "MRK", "MSFT", "PFE",
"PG", "T", "TRV", "UTX", "VZ", "WMT", "XOM"]
quotes = []
for symbol in symbols:
try:
quotes.append(finance.quotes_historical_yahoo(symbol, start, end, asobject=True))
except urllib2.HTTPError as e:
print(symbol, e)
close = np.array([q.close for q in quotes]).astype(np.float)
print(close.shape)
logreturns = np.diff(np.log(close))
print(logreturns.shape)
logreturns_norms = np.sum(logreturns ** 2, axis=1)
S = - logreturns_norms[:, np.newaxis] - logreturns_norms[np. newaxis, :] + 2 * np.dot(logreturns, logreturns.T)
aff_pro = sklearn.cluster.AffinityPropagation().fit(S)
labels = aff_pro.labels_
for symbol, label in zip(symbols, labels):
print('%s in Cluster %d' % (symbol, label))
'''
AA in Cluster 0
AXP in Cluster 6
BA in Cluster 6
BAC in Cluster 1
CAT in Cluster 6
CSCO in Cluster 2
CVX in Cluster 7
DD in Cluster 6
DIS in Cluster 6
GE in Cluster 6
HD in Cluster 5
HPQ in Cluster 3
IBM in Cluster 5
INTC in Cluster 6
JNJ in Cluster 5
JPM in Cluster 4
KO in Cluster 5
MCD in Cluster 5
MMM in Cluster 6
MRK in Cluster 5
MSFT in Cluster 5
PFE in Cluster 7
PG in Cluster 5
T in Cluster 5
TRV in Cluster 5
UTX in Cluster 6
VZ in Cluster 5
WMT in Cluster 5
XOM in Cluster 7
使用 statsmodels 执行正态性测试
from __future__ import print_function
import datetime
import numpy as np
from matplotlib import finance
from statsmodels.stats.adnorm import normal_ad
start = datetime.datetime(2011, 01, 01)
end = datetime.datetime(2012, 01, 01)
quotes = finance.quotes_historical_yahoo('AAPL', start, end, asobject=True)
close = np.array(quotes.close).astype(np.float)
print(close.shape)
print(normal_ad(np.diff(np.log(close))))
角点检测
from skimage.feature import corner_peaks
from skimage.color import rgb2gray
dataset = load_sample_images()
img = dataset.images[0]
gray_img = rgb2gray(img)
harris_coords = corner_peaks(corner_harris(gray_img))
y, x = np.transpose(harris_coords)
plt.axis('off')
plt.imshow(img)
plt.plot(x, y, 'ro')
plt.show()
边界检测
from sklearn.datasets import load_sample_images
import matplotlib.pyplot as plt
import skimage.feature
dataset = load_sample_images()
img = dataset.images[0]
edges = skimage.feature.canny(img[..., 0])
plt.axis('off')
plt.imshow(edges)
plt.show()