机器学习实战python代码总结

 

把之前自己写的笔记在云端保存一下

pandas库 读数据 import pandas as pd
pd.read_csv('breast-cancer-wisconsin.data')
numpy库 矩阵计算 import numpy as np
X = np.array(df.drop(['class'], 1))
Y = np.array(df['class'])
knn 使用scikit-learn中k近邻算法 neighbors.KNeighborsClassifier() from sklearn import cross_validation, neighbors
X_train,X_test,Y_train,Y_test = cross_validation.train_test_split(X, Y, test_size=0.2)
clf = neighbors.KNeighborsClassifier()
clf.fit(X_train, Y_train)
 
math库 计算log from math import log
decision tree
决策树分类
from sklearn import tree
tree.DecisionTreeClassifier()
>>> from sklearn import tree
>>> X = [[0, 0], [1, 1]]
>>> Y = [0, 1]
>>> clf = tree.DecisionTreeClassifier()
>>> clf = clf.fit(X, Y)
 
matplotlib库 画图 import matplotlib.pyplot as plt
import numpy as np
np.random.seed(19680801)
data = np.random.randn(2, 100)
fig, axs = plt.subplots(2, 2, figsize=(5, 5))
plt.show()
朴素bayes from sklearn import datasets >>> from sklearn import datasets
>>> iris = datasets.load_iris()
>>> from sklearn.naive_bayes import GaussianNB
>>> gnb = GaussianNB()
>>> y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
logistic回归 from sklearn.linear_model import LogisticRegression

clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01)
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
digits = datasets.load_digits()
X, y = digits.data, digits.target
X = StandardScaler().fit_transform(X)
for i, C in enumerate((100, 1, 0.01)):
    clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01)
    clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01)
    clf_l1_LR.fit(X, y)
    clf_l2_LR.fit(X, y)
svm from sklearn import svm >>> from sklearn import svm
>>> X = [[0, 0], [1, 1]]
>>> y = [0, 1]
>>> clf = svm.SVC()
>>> clf.fit(X, y) 
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
bagging 还包含knn >>> from sklearn.ensemble import BaggingClassifier
>>> from sklearn.neighbors import KNeighborsClassifier
>>> bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
random forest from sklearn.ensemble import RandomForestClassifier >>> from sklearn.ensemble import RandomForestClassifier
>>> X = [[0, 0], [1, 1]]
>>> Y = [0, 1]
>>> clf = RandomForestClassifier(n_estimators=10)
>>> clf = clf.fit(X, Y)
adaboost from sklearn.ensemble import AdaBoostClassifier >>> from sklearn.model_selection import cross_val_score
>>> from sklearn.datasets import load_iris
>>> from sklearn.ensemble import AdaBoostClassifier
>>> iris = load_iris()
>>> clf = AdaBoostClassifier(n_estimators=100)
>>> scores = cross_val_score(clf, iris.data, iris.target)
>>> scores.mean()                            
0.9...
回归 import regression
from sklearn import linear_model
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
lr = linear_model.LinearRegression()
boston = datasets.load_boston()
y = boston.target
predicted = cross_val_predict(lr, boston.data, y, cv=10)
ridge回归 from sklearn import linear_model
reg = linear_model.Ridge (alpha = .5)
>>> from sklearn import linear_model
>>> reg = linear_model.Ridge (alpha = .5)
>>> reg.fit ([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)
>>> reg.coef_
array([ 0.34545455,  0.34545455])
>>> reg.intercept_
0.13636..
lasso from sklearn import linear_model
reg = linear_model.Lasso(alpha = 0.1)
>>> from sklearn import linear_model
>>> reg = linear_model.Lasso(alpha = 0.1)
>>> reg.fit([[0, 0], [1, 1]], [0, 1])
Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
>>> reg.predict([[1, 1]])
array([ 0.8])
决策树回归 from sklearn import tree
clf = tree.DecisionTreeRegressor()
>>> from sklearn import tree
>>> X = [[0, 0], [2, 2]]
>>> y = [0.5, 2.5]
>>> clf = tree.DecisionTreeRegressor()
>>> clf = clf.fit(X, y)
>>> clf.predict([[1, 1]])
array([ 0.5])
k-means from sklearn.cluster import KMeans
KMeans(n_clusters=2, random_state=0).fit(X)
>>> from sklearn.cluster import KMeans
>>> import numpy as np
>>> X = np.array([[1, 2], [1, 4], [1, 0],
...               [4, 2], [4, 4], [4, 0]])
>>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
>>> kmeans.labels_
array([0, 0, 0, 1, 1, 1], dtype=int32)
>>> kmeans.predict([[0, 0], [4, 4]])
array([0, 1], dtype=int32)
>>> kmeans.cluster_centers_
array([[ 1.,  2.], [ 4.,  2.]])
关联分析
apriori算法
from apyori import apriori from apyori import apriori
transactions = [['beer', 'nuts'],['beer', 'cheese'],]
results = list(apriori(transactions))
频繁项集
FP-growth算法
自己构造
pca from sklearn.decomposition import PCA
PCA(n_components=2)
>>> import numpy as np
>>> from sklearn.decomposition import PCA
>>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
>>> pca = PCA(n_components=2)
>>> pca.fit(X)
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
svd from numpy import *
U, Sigma, VT=linalg.svd([[1,1],[7,7]])
sys库:访问与Python解释器相关的变量和函数 import sys
sys.argv 命令行参数List,第一个元素是程序本身路径
sys.modules.keys() 返回所有已经导入的模块列表
sys.modules 返回系统导入的模块字段,key是模块名,value是模块
sys.path 返回模块的搜索路径,初始化时使用PYTHONPATH环境变量的值
sys.platform 返回操作系统平台名称
os库:针对系统环境的交互 import os
os.remove() 删除文件
os.rename() 重命名文件
os.walk() 生成目录树下的所有文件名
os.path.basename() 去掉目录路径,返回文件名
os.path.dirname() 去掉文件名,返回目录路径
使用Python实现Hadoop MapReduce程序 http://blog.csdn.net/zhaoyl03/article/details/8657031

 

你可能感兴趣的:(笔记)