Backing up the notes I wrote earlier to the cloud.
**pandas** (reading data):

```python
import pandas as pd

df = pd.read_csv('breast-cancer-wisconsin.data')
```
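The UCI breast-cancer file encodes missing values as `'?'` and starts each row with a sample id; a cleanup sketch of my own, assuming the CSV was given headers including an `id` column:

```python
import pandas as pd

df = pd.read_csv('breast-cancer-wisconsin.data')

# the UCI file marks missing values with '?'; swap in a large sentinel
# so they behave as outliers rather than crashing the classifier
df.replace('?', -99999, inplace=True)

# the sample id carries no predictive signal
# (assumes the CSV was given an 'id' column header)
df.drop(['id'], axis=1, inplace=True)
```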
**numpy** (matrix computation):

```python
import numpy as np

# features: every column except the label; labels: the 'class' column
X = np.array(df.drop(['class'], axis=1))
Y = np.array(df['class'])
```
**knn** (k-nearest neighbors via scikit-learn's `neighbors.KNeighborsClassifier()`):

```python
from sklearn import neighbors
# model_selection replaces the removed cross_validation module
from sklearn.model_selection import train_test_split

# hold out 20% of the data for testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

clf = neighbors.KNeighborsClassifier()
clf.fit(X_train, Y_train)
```
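Not part of the original note, but the natural next step is to score the model on the held-out split and classify a fresh sample:

```python
import numpy as np

# accuracy on the 20% held-out test set
accuracy = clf.score(X_test, Y_test)
print(accuracy)

# classify one made-up sample; it must have the same number of
# features as X (9 here, assuming the id column was dropped)
example = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1]])
print(clf.predict(example))
```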
**math** (computing logarithms): `from math import log`
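In these notes `log` mostly shows up in entropy calculations for decision trees; a minimal Shannon-entropy sketch of my own:

```python
from collections import Counter
from math import log

def shannon_entropy(labels):
    """H = -sum(p * log2(p)) over the observed label distribution."""
    counts = Counter(labels)
    total = len(labels)
    return -sum((n / total) * log(n / total, 2) for n in counts.values())

print(shannon_entropy([0, 0, 1, 1]))  # 1.0 bit for a 50/50 split
```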
**decision tree** (decision tree classification via `tree.DecisionTreeClassifier()`):

```python
from sklearn import tree

X = [[0, 0], [1, 1]]
Y = [0, 1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)
```
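Once fitted, the tree can label new points; this usage line follows the scikit-learn documentation example:

```python
print(clf.predict([[2., 2.]]))  # array([1])
```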
**matplotlib** (plotting):

```python
import matplotlib.pyplot as plt
import numpy as np

np.random.seed(19680801)
data = np.random.randn(2, 100)

fig, axs = plt.subplots(2, 2, figsize=(5, 5))
# draw the random data on each panel so the figure is not empty
axs[0, 0].hist(data[0])
axs[1, 0].scatter(data[0], data[1])
axs[0, 1].plot(data[0], data[1])
axs[1, 1].hist2d(data[0], data[1])
plt.show()
```
**naive Bayes** (`GaussianNB` from scikit-learn):

```python
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB

iris = datasets.load_iris()
gnb = GaussianNB()
y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
```
**logistic regression** (`LogisticRegression` with L1/L2 penalties):

```python
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

digits = datasets.load_digits()
X, y = digits.data, digits.target
X = StandardScaler().fit_transform(X)

for i, C in enumerate((100, 1, 0.01)):
    # liblinear supports the L1 penalty; newer scikit-learn defaults
    # to a solver that does not
    clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01, solver='liblinear')
    clf_l2_LR = LogisticRegression(C=C, penalty='l2', tol=0.01, solver='liblinear')
    clf_l1_LR.fit(X, y)
    clf_l2_LR.fit(X, y)
```
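The scikit-learn example this snippet comes from sweeps C to show the L1 penalty driving coefficients to exactly zero; a follow-up of my own that measures that sparsity after each fit:

```python
import numpy as np

# fraction of coefficients zeroed out; smaller C (stronger
# regularization) pushes the L1 number toward 100%
print("L1 sparsity: %.2f%%" % (100 * np.mean(clf_l1_LR.coef_ == 0)))
print("L2 sparsity: %.2f%%" % (100 * np.mean(clf_l2_LR.coef_ == 0)))
```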
**svm** (support vector classification):

```python
from sklearn import svm

X = [[0, 0], [1, 1]]
y = [0, 1]
clf = svm.SVC()
clf.fit(X, y)
```
**bagging** (ensemble wrapper; here around kNN):

```python
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# each base kNN sees a random 50% of the samples and 50% of the features
bagging = BaggingClassifier(KNeighborsClassifier(),
                            max_samples=0.5, max_features=0.5)
```
**random forest** (`RandomForestClassifier`):

```python
from sklearn.ensemble import RandomForestClassifier

X = [[0, 0], [1, 1]]
Y = [0, 1]
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(X, Y)
```
**adaboost** (`AdaBoostClassifier`):

```python
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier

iris = load_iris()
clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, iris.data, iris.target)
print(scores.mean())  # roughly 0.9
```
**regression** (linear regression via `sklearn.linear_model`):

```python
from sklearn.model_selection import cross_val_predict
from sklearn import datasets, linear_model

lr = linear_model.LinearRegression()
boston = datasets.load_boston()  # removed in scikit-learn 1.2; needs an older version
y = boston.target

# out-of-fold predictions from 10-fold cross-validation
predicted = cross_val_predict(lr, boston.data, y, cv=10)
```
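scikit-learn's `cross_val_predict` example continues by plotting the out-of-fold predictions against the true targets; a sketch of that plot:

```python
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(y, predicted, edgecolors=(0, 0, 0))
# the diagonal marks perfect predictions
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
```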
**ridge regression** (`linear_model.Ridge`):

```python
from sklearn import linear_model

reg = linear_model.Ridge(alpha=0.5)
reg.fit([[0, 0], [0, 0], [1, 1]], [0, .1, 1])
print(reg.coef_)       # array([0.34545455, 0.34545455])
print(reg.intercept_)  # 0.13636...
```
**lasso** (`linear_model.Lasso`):

```python
from sklearn import linear_model

reg = linear_model.Lasso(alpha=0.1)
reg.fit([[0, 0], [1, 1]], [0, 1])
print(reg.predict([[1, 1]]))  # array([0.8])
```
**decision tree regression** (`tree.DecisionTreeRegressor()`):

```python
from sklearn import tree

X = [[0, 0], [2, 2]]
y = [0.5, 2.5]
clf = tree.DecisionTreeRegressor()
clf = clf.fit(X, y)
print(clf.predict([[1, 1]]))  # array([0.5])
```
**k-means** (`KMeans` from `sklearn.cluster`):

```python
from sklearn.cluster import KMeans
import numpy as np

X = np.array([[1, 2], [1, 4], [1, 0],
              [4, 2], [4, 4], [4, 0]])
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
print(kmeans.labels_)                    # array([0, 0, 0, 1, 1, 1], dtype=int32)
print(kmeans.predict([[0, 0], [4, 4]]))  # array([0, 1], dtype=int32)
print(kmeans.cluster_centers_)           # array([[1., 2.], [4., 2.]])
```
**association analysis** (Apriori algorithm via the `apyori` package):

```python
from apyori import apriori

transactions = [
    ['beer', 'nuts'],
    ['beer', 'cheese'],
]
results = list(apriori(transactions))
```
**frequent itemsets** (FP-growth algorithm): built by hand, no library snippet recorded.
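If hand-rolling FP-growth is overkill, one alternative (my suggestion, not from the original note) is the `mlxtend` implementation, which wants the transactions one-hot encoded first:

```python
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

transactions = [['beer', 'nuts'], ['beer', 'cheese']]

# one-hot encode the transactions into a boolean DataFrame
te = TransactionEncoder()
onehot = te.fit(transactions).transform(transactions)
df = pd.DataFrame(onehot, columns=te.columns_)

# frequent itemsets appearing in at least 50% of the transactions
print(fpgrowth(df, min_support=0.5, use_colnames=True))
```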
**pca** (`PCA(n_components=2)` from `sklearn.decomposition`):

```python
import numpy as np
from sklearn.decomposition import PCA

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca = PCA(n_components=2)
pca.fit(X)
print(pca.explained_variance_ratio_)  # variance captured by each component
```
**svd** (singular value decomposition with numpy):

```python
import numpy as np  # preferable to the original `from numpy import *`

U, Sigma, VT = np.linalg.svd([[1, 1], [7, 7]])
```
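numpy returns `Sigma` as a 1-D vector of singular values, so rebuilding the matrix takes one extra step; a quick sketch of my own:

```python
import numpy as np

A = np.array([[1, 1], [7, 7]])
U, Sigma, VT = np.linalg.svd(A)

# promote the singular values to a diagonal matrix to reconstruct A
print(np.allclose(A, U @ np.diag(Sigma) @ VT))  # True

# rank-1 approximation: keep only the largest singular value
# (A is already rank 1 here, so this reproduces it exactly)
print(Sigma[0] * np.outer(U[:, 0], VT[0, :]))
```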
**sys** (information about the Python interpreter and its runtime), after `import sys`:

- `sys.argv`: list of command-line arguments; the first element is the path of the program itself
- `sys.modules.keys()`: names of all modules imported so far
- `sys.modules`: dict of imported modules, keyed by module name with the module objects as values
- `sys.path`: module search path, initialized from the PYTHONPATH environment variable
- `sys.platform`: name of the operating-system platform
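A throwaway script of my own exercising these attributes:

```python
import sys

print(sys.argv)          # e.g. ['demo.py', '--verbose'] for: python demo.py --verbose
print(sys.platform)      # e.g. 'linux', 'darwin', or 'win32'
print(len(sys.modules))  # how many modules are imported so far
print(sys.path[:3])      # first few entries of the module search path
```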
**os** (interacting with the operating-system environment), after `import os`:

- `os.remove()`: delete a file
- `os.rename()`: rename a file
- `os.walk()`: generate all the file names in a directory tree
- `os.path.basename()`: strip the directory path, returning the file name
- `os.path.dirname()`: strip the file name, returning the directory path
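For instance, a sketch of my own combining `os.walk` with the path helpers:

```python
import os

# print every file under the current directory as "directory -> file name"
for dirpath, dirnames, filenames in os.walk('.'):
    for name in filenames:
        full_path = os.path.join(dirpath, name)
        print(os.path.dirname(full_path), '->', os.path.basename(full_path))
```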
**Hadoop MapReduce in Python**: see http://blog.csdn.net/zhaoyl03/article/details/8657031
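The linked post uses Hadoop Streaming, where the mapper and reducer are plain Python scripts wired together through stdin/stdout. A minimal word-count pair in that style (my reconstruction, not the post's exact code):

```python
# mapper.py: emit "word<TAB>1" for every word on stdin
import sys

for line in sys.stdin:
    for word in line.strip().split():
        print('%s\t%s' % (word, 1))
```

```python
# reducer.py: Hadoop sorts mapper output by key, so all counts
# for a given word arrive on consecutive lines
import sys

current_word, current_count = None, 0
for line in sys.stdin:
    word, count = line.strip().split('\t', 1)
    if word == current_word:
        current_count += int(count)
    else:
        if current_word is not None:
            print('%s\t%s' % (current_word, current_count))
        current_word, current_count = word, int(count)
if current_word is not None:
    print('%s\t%s' % (current_word, current_count))
```

Both scripts are then handed to the streaming jar, along the lines of `hadoop jar hadoop-streaming.jar -mapper mapper.py -reducer reducer.py -input <in> -output <out>` (the exact jar path varies by installation).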