Sany 何灿

预测房价 | 实战（一）

1.设置

确保matplotlib工作和写函数来保存图片

保证python2和python3的可以使用
from __future__ import division, print_function, unicode_literals

import numpy as np
import os

# 确保notebook运行时结果稳定
np.random.seed(42)

# 修改默认属性
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# 设置储存图片路径
PROJECT_ROOT_DIR = "."
CHAPETER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
	path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
	print("Saving figure", fig_id)
	if tight_layout:
	plt.tight_layout()
	plt.savefig(path, format=fig_extension, dpi=resolution)

# 忽略没用的警告
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

2. 获取数据

import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH)
	os.makedirs(housing_path, exist_ok=True) # 创建目录
	tgz_path = os.path.join(housing_path, "housing.tgz") 
	urllib.request.urlretrieve(housing_url, tgz_path) # 从远程获取文件放入指定文件
	housing_tgz = tarfile.open(tgz_path)
	housing_tgz.extractall(path=housing_path) # 解压文件到当前目录
	housing_tgz.close()

fetch_housing_data()

import pandas as pd
def load_housing_data(housing_path=HOUSING_PATH):
	csv_path = os.path.join(housing_path, "housing.csv")
	return pd.read_csv(csv_path)

housing = load_housing_data()
housing.head()

housing.info()

housing["ocean_proximity"].value_counts()

housing.describe()

%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

# 使每次运行时输出的结果保持一致
np.random.seed(42)

import numpy as np
def split_train_test(data, test_ratio):
	shuffled_indices = np.random.permutation(len(data))
	test_set_size = int(len(data) * test_ratio)
	test_indices = shuffled_indices[:test_set_size]
	train_indices = shuffled_indices[test_set_size:]
	return data.iloc[train_indices], data.iloc[test_indices]

train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), "train +", len(test_set), "test")

from zlib import crc32
def test_set_check(identifier, test_ratio):
	return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
	ids = data[id_column]
	in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
	return data.loc[~in_test_set], data.loc[in_test_set]

import hashlib
def test_set_check(identifier, test_ratio, hash=hashlib.md5):
	return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

def test_set_check(identifier, test_ratio, hash=hashlib.md5):
	return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio

housing_with_id = housing.reset_index()   # adds an `index` column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

test_set.head()

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

test_set.head()

housing["median_income"].hist()

# 将数据分成五个区间，并且标签
housing["income_cat"] = pd.cut(housing["median_income"],bins=[0., 1.5, 3.0, 4.5, 6., np.inf],labels=[1, 2, 3, 4, 5])

housing["income_cat"].value_counts()

using["income_cat"].hist()

# 交叉验证
from sklearn.model_selection import StratifiedShuffleSplit
for train_index, test_index in split.split(housing, housing["income_cat"]):
	strat_train_set = housing.loc[train_index]
	strat_test_set = housing.loc[test_index]

strat_test_set["income_cat"].value_counts() / len(strat_test_set)

housing["income_cat"].value_counts() / len(housing)

def income_cat_proportions(data):
	return data["income_cat"].value_counts() / len(data)

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

compare_props = pd.DataFrame({
	"Overall": income_cat_proportions(housing),
	"Stratified": income_cat_proportions(strat_test_set),
	"Random": income_cat_proportions(test_set),
}).sort_index()
compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100

compare_props

for set_ in (strat_train_set, strat_test_set):
	set_.drop("income_cat", axis=1, inplace=True)

3. 可视化数据，探索联系

housing = strat_train_set.copy()

housing.plot(kind="scatter", x="longitude", y="latitude")
save_fig("bad_visualization_plot")

housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
save_fig("better_visualization_plot")

housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4, s=housing["population"]/100, label="population", figsize=(10,7), c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True, sharex=False)

plt.legend()
save_fig("housing_prices_scatterplot")

import matplotlib.image as mpimg
california_img = mpimg.imread(PROJECT_ROOT_DIR + '/images/end_to_end_project/california.png')
ax = housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(10,7), s=housing['population']/100, label="Population", c="median_house_value", cmap=plt.get_cmap("jet"),  colorbar=False, alpha=0.4,)
plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5, cmap=plt.get_cmap("jet"))
plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)

prices = housing["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
cbar = plt.colorbar()
cbar.ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values], fontsize=14)
cbar.set_label('Median House Value', fontsize=16)

plt.legend(fontsize=16)
save_fig("california_housing_prices_plot")
plt.show()

corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

from pandas.plotting import scatter_matrix
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
save_fig("scatter_matrix_plot")

housing.plot(kind="scatter", x="median_income", y="median_house_value",alpha=0.1)
plt.axis([0, 16, 0, 550000])
save_fig("income_vs_house_value_scatterplot")

housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

housing.plot(kind="scatter", x="rooms_per_household", y="median_house_value", alpha=0.2)
plt.axis([0, 5, 0, 520000])
plt.show()

housing.describe()

4. 预处理数据，为机器学习算法做准备

housing = strat_train_set.drop("median_house_value", axis=1) # 舍弃训练集中的标记
housing_labels = strat_train_set["median_house_value"].copy()

sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows

sample_incomplete_rows.dropna(subset=["total_bedrooms"]) # option 1
sample_incomplete_rows.drop("total_bedrooms", axis=1) # option 2

median = housing["total_bedrooms"].median()
sample_incomplete_rows["total_bedrooms"].fillna(median, inplace=True) # option 3
sample_incomplete_rows

try:
	from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
	from sklearn.preprocessing import Imputer as SimpleImputer

imputer = SimpleImputer(strategy="median")

## 要先移除文本属性的值，再插值
housing_num = housing.drop('ocean_proximity', axis=1)
# alternatively: housing_num = housing.select_dtypes(include=[np.number])

# 计算每一列的中间值
imputer.fit(housing_num)

# 显示计算的每一列的中间值
imputer.statistics_

X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing.index)

housing_tr.loc[sample_incomplete_rows.index.values]

imputer.strategy

housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)
housing_tr.head()

housing_cat = housing[['ocean_proximity']]
housing_cat.head(10)

try:
	from sklearn.preprocessing import OrdinalEncoder
except ImportError:
	from future_encoders import OrdinalEncoder # Scikit-Learn < 0.20

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

ordinal_encoder.categories_

try:
	from sklearn.preprocessing import OrdinalEncoder
	from sklearn.preprocessing import OneHotEncoder
except ImportError:
	from future_encoders import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

housing_cat_1hot.toarray()

cat_encoder = OneHotEncoder(sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

cat_encoder.categories_

housing.columns

from sklearn.base import BaseEstimator, TransformerMixin

# 获得指定列名的下标
rooms_ix, bedrooms_ix, population_ix, household_ix = [list(housing.columns).index(col) for col in ("total_rooms", "total_bedrooms", "population", "households")]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
	# 为原数据集添加新的columns，表征新的属性
	def __init__(self, add_bedrooms_per_room = True): # no *args or **kwargs
		self.add_bedrooms_per_room = add_bedrooms_per_room
	def fit(self, X, y=None):
		return self
	def transform(self, X, y=None):
		rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
		population_per_household = X[:, population_ix] / X[:, household_ix]
		if self.add_bedrooms_per_room:
			bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
			return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
		else:
			return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# 另一种方式构建添加新属性的函数
from sklearn.preprocessing import FunctionTransformer

def add_extra_features(X, add_bedrooms_per_room=True):
	rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
	population_per_household = X[:, population_ix] / X[:, household_ix]
	if add_bedrooms_per_room:
		bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
		return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
	else:
		return np.c_[X, rooms_per_household, population_per_household]
attr_adder = FunctionTransformer(add_extra_features, validate=False,kw_args={"add_bedrooms_per_room": False})
housing_extra_attribs = attr_adder.fit_transform(housing.values)

housing_extra_attribs = pd.DataFrame(housing_extra_attribs,columns=list(housing.columns)+["rooms_per_household", "population_per_household"], index=housing.index)
housing_extra_attribs.head()

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# 用来预处理数值属性的pipeline
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")),
						('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
						('std_scaler', StandardScaler()),
						])

housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr

try:
	from sklearn.compose import ColumnTransformer
except ImportError:
	from future_encoders import ColumnTransformer # Scikit-Learn < 0.20

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
		("num", num_pipeline, num_attribs),
		("cat", OneHotEncoder(), cat_attribs),
	])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

housing_prepared.shape

# 另一种构建处理类别和数值属性的pipeline方式
from sklearn.base import BaseEstimator, TransformerMixin

class OldDataFrameSelector(BaseEstimator, TransformerMixin):
	def __init__(self, attribute_names):
		self.attribute_names = attribute_names
	def fit(self, X, y=None):
		return self
	def transform(self, X):
		return X[self.attribute_names].values

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

old_num_pipeline = Pipeline([
		('selector', OldDataFrameSelector(num_attribs)),
		('imputer', SimpleImputer(strategy="median")),
		('attribs_adder', FunctionTransformer(add_extra_features, validate=False)),
		('std_scaler', StandardScaler()),
	])

old_cat_pipeline = Pipeline([
		('selector', OldDataFrameSelector(cat_attribs)),
		('cat_encoder', OneHotEncoder(sparse=False)),
	])


from sklearn.pipeline import FeatureUnion

old_full_pipeline = FeatureUnion(transformer_list=[
		 ("num_pipeline", old_num_pipeline),
		 ("cat_pipeline", old_cat_pipeline),
	])

old_housing_prepared = old_full_pipeline.fit_transform(housing)
old_housing_prepared

np.allclose(housing_prepared, old_housing_prepared)

5.选择和训练模型

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

print("Labels:", list(some_labels))

some_data_prepared

from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

6.调整模型参数

from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
	print("Scores:", scores)
	print("Mean:", scores.mean())
	print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

from sklearn.model_selection import cross_val_score 

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(np.sqrt(-scores)).describe()

from sklearn.svm import SVR

svm_reg = SVR(kernel="linear")
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
svm_rmse

from sklearn.model_selection import GridSearchCV
param_grid = [
	# 尝试12（3x4）组超参数组合
	{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
	# 设置bootstap为否，尝试6（2x3）组超参数组合
	{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
	
forest_reg = RandomForestRegressor(random_state=42)

# 总共训练（12+6）x 5=90轮
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

grid_search.best_params_

grid_search.best_estimator_

cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
	print(np.sqrt(-mean_score), params)

pd.DataFrame(grid_search.cv_results_)

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
		'n_estimators': randint(low=1, high=200),
		'max_features': randint(low=1, high=8),
	}

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
print(np.sqrt(-mean_score), params)

feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
mean = squared_errors.mean()
m = len(squared_errors)

np.sqrt(stats.t.interval(confidence, m - 1,
						loc=np.mean(squared_errors),
						scale=stats.sem(squared_errors)))

你可能感兴趣的:(机器学习实战)

《机器学习实战》——在python中使用Matplotlib注解绘制树形图哆啦AA梦 python 机器学习 python 机器学习
#encoding=utf-8#使用文本注解绘制树形图importmatplotlib.pyplotaspltdecisionNode=dict(boxstyle="sawtooth",fc="0.8")leafNode=dict(boxstyle="round4",fc="0.8")arrow_args=dict(arrowstyle="<-")#上面三行代码定义文本框和箭头格式#定义决策树决策
Python从0到100（六十一）：机器学习实战-实现客户细分是Dream呀 python 机器学习开发语言
前言：零基础学Python：Python从0到100最新最全教程。想做这件事情很久了，这次我更新了自己所写过的所有博客，汇集成了Python从0到100，共一百节课，帮助大家一个月时间里从零基础到学习Python基础语法、Python爬虫、Web开发、计算机视觉、机器学习、神经网络以及人工智能相关知识，成为学习学习和学业的先行者！欢迎大家订阅专栏：零基础学Python：Python从0到100最新
【数据挖掘实战】房价预测机器学习司猫白数据挖掘人工智能 python 机器学习
本次对kaggle中的入门级数据集，房价回归数据集进行数据挖掘，预测房屋价格。本人主页：机器学习司猫白机器学习专栏：机器学习实战PyTorch入门专栏：PyTorch入门深度学习实战：深度学习ok，话不多说，我们进入正题吧概述本次竞赛有79个解释变量（几乎）描述了爱荷华州艾姆斯住宅的各个方面，需要预测每套住宅的最终价格。数据集描述本次数据集已经上传，大家可以自行下载尝试文件说明train.csv-
【机器学习实战入门】使用OpenCV进行性别和年龄检测精通代码大仙数据挖掘深度学习 python 机器学习 python opencv 数据挖掘人工智能
GenderandAgeDetectionPython项目首先，向您介绍用于此高级Python项目的性别和年龄检测中的术语：什么是计算机视觉？计算机视觉是一门让计算机能够像人类一样观察和识别数字图像和视频的学科。它面临的挑战大多源于对生物视觉有限的了解。计算机视觉涉及获取、处理、分析和理解数字图像，旨在从现实世界中提取高维数据，从而生成可用来做决策的符号或数值信息。该过程通常包括物体识别、视频跟踪
【机器学习实战中阶】音乐流派分类-自动化分类不同音乐风格精通代码大仙数据挖掘深度学习 python 机器学习分类自动化人工智能数据挖掘深度学习
音乐流派分类–自动化分类不同音乐风格在本教程中，我们将开发一个深度学习项目，用于自动化地从音频文件中分类不同的音乐流派。我们将使用音频文件的频率域和时间域低级特征来分类这些音频文件。对于这个项目，我们需要一个具有相似大小和相似频率范围的音频曲目数据集。GTZAN流派分类数据集是音乐流派分类项目中最推荐的数据集，并且它是为了这个任务而收集的。音乐流派分类器模型音乐流派分类关于数据集：GTZAN流派收
【机器学习实战入门项目】基于机器学习的鸢尾花分类项目精通代码大仙数据挖掘 python 深度学习机器学习分类人工智能大数据数据挖掘算法 python
基于机器学习的鸢尾花分类项目介绍：本项目利用机器学习模型对鸢尾花进行分类。鸢尾花数据集是一个著名的机器学习数据集，包含三种类别的花朵：Setosa、Versicolor和Virginica，每种类别由四个特征描述：萼片长度、萼片宽度、花瓣长度和花瓣宽度。什么是机器学习？机器学习是关于从数据中学习预测或提取知识的过程。它是人工智能的一个子领域。机器学习算法基于样本数据（即训练数据）构建模型，并根据训
机器学习实战笔记5——线性判别分析绍少阿机器学习笔记可视化机器学习 python 人工智能
任务安排1、机器学习导论8、核方法2、KNN及其实现9、稀疏表示3、K-means聚类10、高斯混合模型4、主成分分析11、嵌入学习5、线性判别分析12、强化学习6、贝叶斯方法13、PageRank7、逻辑回归14、深度学习线性判别分析（LDA）Ⅰ核心思想对于同样一件事，站在不同的角度，我们往往会有不同的看法，而降维思想，亦是如此。同上节课一样，我们还是学习降维的算法，只是提供了一种新的角度，由上
机器学习实战----波士顿房价预测模型永远偷渡不了的非洲人机器学习机器学习 sklearn python
波士顿房价模型预测是一个回归问题，可以采用r2_score方法来作为评价指标。importnumpyasnpimportpandasaspdfromsklearn.metricsimportr2_score#从sklearn的数据库中导入波士顿房产数据fromsklearn.datasetsimportload_bostonfromsklearn.model_selectionimporttrai
python logistic模型_Python实践之逻辑回归（Logistic Regression） weixin_39922394 python logistic模型
机器学习算法与Python实践这个系列主要是参考《机器学习实战》这本书。因为自己想学习Python，然后也想对一些机器学习算法加深下了解，所以就想通过Python来实现几个比较常用的机器学习算法。恰好遇见这本同样定位的书籍，所以就参考这本书的过程来学习了。这节学习的是逻辑回归(LogisticRegression)，也算进入了比较正统的机器学习算法。啥叫正统呢？我概念里面机器学习算法一般是这样一个
(二十一)Seaborn知识学习8-python数据分析与机器学习实战(学习笔记) 努力奋斗的durian
文章原创,最近更新：2018-05-17课程来源:python数据分析与机器学习实战-唐宇迪引言:介绍seaborn热度图绘制学习参考链接:1、Seaborn官方0.8.1版本首先介绍以下热度图的作用,拿出离散群数据,离散群数据可能会发生波动变化.看一下哪个点的值比较高,看一下哪个点的值比较低?通过值的变化,用颜色表现出来,这个是我们要做的一件事.热度图是由不同的颜色构成的,这个颜色由可能是由浅入
机器学习实战2--蒙特卡洛方法与Q-Q图(2022/10/12) 点灯的棉羊机器学习Jupyter笔记机器学习人工智能 numpy python
蒙特卡洛方法与Q-Q图文章目录蒙特卡洛方法与Q-Q图蒙特卡洛方法蒙特卡洛的定义和基本步骤一些常用的概率论相关函数使用蒙特卡洛验证大数定理Q-Q图Q-Q图的定义及用途importnumpyasnpfromnumpy.linalgimportinv,eigimportmatplotlib.pyplotaspltimportpandasaspdfromscipy.statsimportnorm蒙特卡洛方
机器学习实战1-基础运用（2022/10/11）点灯的棉羊机器学习Jupyter笔记机器学习 python numpy
机器学习实战1-基础运用文章目录机器学习实战1-基础运用numpy的简单运用生成矩阵和矩阵的简单操作用pandas库读取、保存csv数据文件read_csv()函数及读入的数据处理to_csv()保存数据matplotlib.pyplot库绘图的使用条形图的绘制箱型图的绘制分位数（Quantile）分位点/四分位数分位数与箱型图`boxplot()`函数绘制交叉报表热力图plt绘图基础import
机器学习实战Jupyter笔记专栏汇总点灯的棉羊机器学习Jupyter笔记机器学习 jupyter 人工智能
机器学习实战Jupter笔记开始博客学校开始的一门机器学习的课程，于是使用jupyter写这门课的作业，顺便将其完善为笔记发表为这个专栏的博客，并将专栏博客链接汇总到这里。由于是刚开始学习机器学习方面的内容，如有错误的地方，希望能有大佬能帮忙指正。笔记1机器学习实战1-基础运用种一棵树最好的时间–是十年前，其次是现在
朴素贝叶斯算法 YuanDaima2048 机器学习算法学习算法机器学习人工智能深度学习 python sklearn
朴素贝叶斯算法一、基本概念二、算法及代码应用朴素贝叶斯NB算法分类算法区别其他机器学习算法：机器学习实战工具安装和使用一、基本概念朴素贝叶斯（NB）是一种基于贝叶斯定理与特征条件独立假设的分类算法。它被广泛应用于文本分类、垃圾邮件过滤等领域。朴素贝叶斯算法简单易懂，其核心思想是假设在给定目标值时，各个属性之间相互独立。在实际应用中，朴素贝叶斯算法在垃圾邮件过滤中表现出色。它不仅准确率高，而且速度快
【机器学习实战】大数据与MapReduce 吵吵人
当运算需求超出了当前资源的运算能力，一、可以考虑购买更好的机器；二、可以将计算转换成并行作业，MapReduce就提供了这种方案的一个具体实施框架。MapReduce：分布式计算的框架MapReduce是一个软件框架，可以将单个计算工作分配给多台计算机执行。工作流程包括map和reduce阶段。第一阶段，输入数据被切片分发到节点上，各个节点对本地数据进行处理对应的运算代码叫做mapper。第二阶段
[培训-Python机器学习]04-Git的使用和规范乱码奇糟软件开发 git
参考书Python机器学习实战作者裔隽张怿檬张目清出版社科学技术文献出版社难度入门安排计划：本章30分钟；作业：上网查阅Linus开发Git的背景；分析所在的开发团队所用的协作开发流程是什么？总结出Git使用和Git流程中遇到过的3个问题，发给大家讨论。非常有意思：2005年，由Linux的创始人LinusTorvalds开发；临危赴命，用时2周。分布式、本地管理、分支管理、提交机制Github、
[培训-Python机器学习]02-使用conda管理环境和包乱码奇糟软件开发 python conda
参考书Python机器学习实战作者裔隽张怿檬张目清出版社科学技术文献出版社难度入门安排计划：本章30分钟；作业：培训后实践本章的各种操作；结果：以Python3.10创建开发虚拟环境；再创建一个Python3.7版本以下的虚拟环境用来调试兼容性以前培训过venv，本次培训来说一说conda。conda其实可理解为：venv+pip，它的主要功能包括：环境管理：创建多个隔离的Python运行环境，每
机器学习（machine learning）大合集 AI信仰者
1、线性分类器怎么理解呢？我们可以把此分类器理解为线性空间的划分，最简单的，在二维空间上，通过直线的划分。第二个理解可以理解为模板匹配，W的每一行可以看做是其中一个类别的模板。每类得分，实际上是像素点和模板匹配度。模板匹配的方式是内积计算。2、机器学习实战之AdaBoost算法boosting算法系列的基本思想，如下图：adaBoost分类器就是一种元算法分类器，adaBoost分类器利用同一种基
机器学习实战朴素贝叶斯分类器 shenny_
基于概率论的分类方法：朴素贝叶斯我的微信公众号：s406205391;欢迎大家一起学习，一起进步！！！k-近邻算法和决策树会给出“该数据属于哪一类”的明确回答。不过，分类器有时会产生错误结果，这是可以要求分类器给出一个最优的类别的猜测结果，同事给出这个猜测的概率估计值。朴素贝叶斯就是一个概率分类器。我们称之为“朴素”，是因为整个形式化的过程只做最原始、最简单的假设。朴素贝叶斯的优点：在数据较少的情
《机器学习实战》笔记（十三）：Ch13 - 利用PCA来简化数据 Lornatang
第13章利用PCA来简化数据(代码)降维技术降维的意思是能够用一组个数为d的向量zi来代表个数为D的向量xi所包含的有用信息，其中d
Python实现时间序列分析马尔可夫切换自回归模型(MarkovAutoregression算法)项目实战胖哥真不错机器学习 python python 机器学习时间序列分析马尔可夫切换自回归模型项目实战
说明：这是一个机器学习实战项目（附带数据+代码+文档+视频讲解），如需数据+代码+文档+视频讲解可以直接到文章最后获取。1.项目背景时间序列分析中的马尔可夫切换自回归模型（MarkovSwitchingAutoregressionModel，简称MSAR或MarkovAutoregression算法）是一种混合了自回归模型（AutoregressiveModel,AR）和马尔可夫链（MarkovC
Python实现时间序列分析马尔可夫切换动态回归模型(MarkovRegression算法)项目实战胖哥真不错机器学习 python python 机器学习时间序列分析马尔可夫切换动态回归模型项目实战
说明：这是一个机器学习实战项目（附带数据+代码+文档+视频讲解），如需数据+代码+文档+视频讲解可以直接到文章最后获取。1.项目背景时间序列分析中的马尔可夫切换动态回归模型（MarkovSwitchingDynamicRegressionModel，MSDRM或简称为MarkovRegression算法）是一种用于处理具有非平稳性和隐藏状态依赖性的时序数据的方法。在该模型中，数据生成过程被认为是在
Python实现时间序列分析季节性自回归综合移动平均外生回归模型(SARIMAX算法)项目实战胖哥真不错机器学习 python python 时间序列分析季节性自回归综合移动平均外生回归模型 SARIMAX 项目实战
说明：这是一个机器学习实战项目（附带数据+代码+文档+视频讲解），如需数据+代码+文档+视频讲解可以直接到文章最后获取。1.项目背景时间序列分析中的季节性自回归综合移动平均外生回归模型（SeasonalAutoregressiveIntegratedMovingAveragewitheXogenousregressors,SARIMAX）是一种统计建模技术，用于分析和预测具有季节性、趋势以及可能受
Python实现时间序列分析AR定阶自回归模型(ar_select_order算法)项目实战胖哥真不错机器学习 python python 机器学习时间序列分析AR定阶自回归模型 ar_select_order 项目实战
说明：这是一个机器学习实战项目（附带数据+代码+文档+视频讲解），如需数据+代码+文档+视频讲解可以直接到文章最后获取。1.项目背景时间序列分析中，AR定阶自回归模型（ARorderselection）是指确定自回归模型（AutoRegressiveModel,AR模型）的阶数p的过程。在AR(p)模型中，当前的时间序列值被表示为过去p个时期的线性组合加上一个误差项。ar_select_order
python机器学习实战|机器学习入门笔记3-Pandas基础知识小赵同学871 机器学习实战入门笔记 python 机器学习 pandas
文章目录1.Pandas介绍2.案例知识点2.1创建DataFrame2.2创建日期3.DataFrame介绍3.1DataFrame属性3.2DataFrame设置索引3.3基本数据操作3.4DataFrame运算1.Pandas介绍开源的数据挖掘库，用于数据探索，封装了matplotlib，numpy2.案例知识点2.1创建DataFramepd.DataFrame(ndarray,index
Python实现离散选择概率模型(Probit算法)项目实战胖哥真不错机器学习 python python 离散选择概率模型 Probit算法机器学习项目实战
说明：这是一个机器学习实战项目（附带数据+代码+文档+视频讲解），如需数据+代码+文档+视频讲解可以直接到文章最后获取。1.项目背景Probit模型是经过Logit模型的形式经过变形后得到的，Probit模型假设与标准正态分布的概率分布函数相似。本项目通过Probit算法来构建概率模型。2.数据获取本次建模数据来源于网络(本项目撰写人整理而成)，数据项统计如下：编号变量名称描述1x12x23x34
机器学习实战 K-近邻算法今昔何夕丶
K-近邻算法优点：精度高、对异常值不敏感、无数据输入假定缺点：计算复杂高、空间复杂度高适用数据范围：数值型和标称型一般流程收集数据：可以使用任何方法准备数据：距离计算所需要的数值，最好是结构化的数据结构分析数据：可以使用任何方法训练算法：此步骤不适用于K-近邻算法测试算法：计算错误率使用算法：首先需要输入样本数据和结构化的输出结果，然后运行K-近邻算法判定输入数据分别属于哪个分类，最后应用对计算出
Python实现稳健线性回归模型(rlm算法)项目实战胖哥真不错机器学习 python python 机器学习稳健线性回归模型 rlm算法项目实战
说明：这是一个机器学习实战项目（附带数据+代码+文档+视频讲解），如需数据+代码+文档+视频讲解可以直接到文章最后获取。1.项目背景稳健回归可以用在任何使用最小二乘回归的情况下。在拟合最小二乘回归时，我们可能会发现一些异常值或高杠杆数据点。已经确定这些数据点不是数据输入错误，也不是来自另一个群落。所以我们没有令人信服的理由将它们排除在分析之外。稳健回归可能是一种好的策略，它是在将这些点完全从分析中
机器学习实战学习记录（github） monkeyhlj 学习
机器学习实战学习记录（github）可见我的github：https://github.com/monkeyhlj/machine_learning_bymyself刚刚建好，后面的学习记录会一直在这个仓库里面更新。推荐参考资料：https://www.zhihu.com/column/c_1242508311053963264
【机器学习实战】决策树吵吵人
算法思路在构造决策树时，第一个需要解决的问题就是，如何确定出哪个特征在划分数据分类是起决定性作用，或者说使用哪个特征分类能实现最好的分类效果。这样，为了找到决定性的特征，划分得到最好的结果，我们就需要评估每个特征。当找到最优特征后，依此特征，数据集就被划分为几个数据子集，这些数据自己会分布在该决策点的所有分支中。此时，如果某个分支下的数据属于同一类型，则该分支下的数据分类已经完成，无需进行下一步的
Maven Array_06 eclipse jdk maven
Maven Maven是基于项目对象模型(POM)，信息来管理项目的构建，报告和文档的软件项目管理工具。 Maven 除了以程序构建能力为特色之外，还提供高级项目管理工具。由于 Maven 的缺省构建规则有较高的可重用性，所以常常用两三行 Maven 构建脚本就可以构建简单的项目。由于 Maven 的面向项目的方法，许多 Apache Jakarta 项目发文时使用 Maven，而且公司
ibatis的queyrForList和queryForMap区别 bijian1013 java ibatis
一.说明 iBatis的返回值参数类型也有种：resultMap与resultClass，这两种类型的选择可以用两句话说明之： 1.当结果集列名和类的属性名完全相对应的时候，则可直接用resultClass直接指定查询结果类
LeetCode[位运算] - #191 计算汉明权重 Cwind java 位运算 LeetCode Algorithm 题解
原题链接：#191 Number of 1 Bits 要求：写一个函数，以一个无符号整数为参数，返回其汉明权重。例如，‘11’的二进制表示为'00000000000000000000000000001011', 故函数应当返回3。汉明权重：指一个字符串中非零字符的个数；对于二进制串，即其中‘1’的个数。难度：简单分析：将十进制参数转换为二进制，然后计算其中1的个数即可。 “
浅谈java类与对象 15700786134 java
java是一门面向对象的编程语言，类与对象是其最基本的概念。所谓对象，就是一个个具体的物体，一个人，一台电脑，都是对象。而类，就是对象的一种抽象，是多个对象具有的共性的一种集合，其中包含了属性与方法，就是属于该类的对象所具有的共性。当一个类创建了对象，这个对象就拥有了该类全部的属性，方法。相比于结构化的编程思路，面向对象更适用于人的思维
linux下双网卡同一个IP 被触发 linux
转自： http://q2482696735.blog.163.com/blog/static/250606077201569029441/ 由于需要一台机器有两个网卡，开始时设置在同一个网段的IP，发现数据总是从一个网卡发出，而另一个网卡上没有数据流动。网上找了下，发现相同的问题不少：一、关于双网卡设置同一网段IP然后连接交换机的时候出现的奇怪现象。当时没有怎么思考、以为是生成树
安卓按主页键隐藏程序之后无法再次打开肆无忌惮_ 安卓
遇到一个奇怪的问题，当SplashActivity跳转到MainActivity之后，按主页键，再去打开程序，程序没法再打开（闪一下），结束任务再开也是这样，只能卸载了再重装。而且每次在Log里都打印了这句话"进入主程序"。后来发现是必须跳转之后再finish掉SplashActivity 本来代码： // 销毁这个Activity fin
通过cookie保存并读取用户登录信息实例知了ing JavaScript html
通过cookie的getCookies()方法可获取所有cookie对象的集合；通过getName()方法可以获取指定的名称的cookie；通过getValue()方法获取到cookie对象的值。另外，将一个cookie对象发送到客户端，使用response对象的addCookie()方法。下面通过cookie保存并读取用户登录信息的例子加深一下理解。（1）创建index.jsp文件。在改
JAVA 对象池矮蛋蛋 java ObjectPool
原文地址： http://www.blogjava.net/baoyaer/articles/218460.html Jakarta对象池 ☆为什么使用对象池恰当地使用对象池化技术，可以有效地减少对象生成和初始化时的消耗，提高系统的运行效率。Jakarta Commons Pool组件提供了一整套用于实现对象池化
ArrayList根据条件+for循环批量删除的方法 alleni123 java
场景如下： ArrayList<Obj> list Obj-> createTime, sid. 现在要根据obj的createTime来进行定期清理。（释放内存） ------------------------- 首先想到的方法就是 for(Obj o:list){ if(o.createTime-currentT>xxx){
阿里巴巴“耕地宝”大战各种宝百合不是茶平台战略
“耕地保”平台是阿里巴巴和安徽农民共同推出的一个 “首个互联网定制私人农场”，“耕地宝”由阿里巴巴投入一亿，主要是用来进行农业方面，将农民手中的散地集中起来不仅加大农民集体在土地上面的话语权，还增加了土地的流通与利用率，提高了土地的产量，有利于大规模的产业化的高科技农业的发展，阿里在农业上的探索将会引起新一轮的产业调整，但是集体化之后农民的个体的话语权将更少，国家应出台相应的法律法规保护
Spring注入有继承关系的类（1） bijian1013 java spring
一个类一个类的注入 1.AClass类 package com.bijian.spring.test2; public class AClass { String a; String b; public String getA() { return a; } public void setA(Strin
30岁转型期你能否成为成功人士 bijian1013 成功
很多人由于年轻时走了弯路，到了30岁一事无成，这样的例子大有人在。但同样也有一些人，整个职业生涯都发展得很优秀，到了30岁已经成为职场的精英阶层。由于做猎头的原因，我们接触很多30岁左右的经理人，发现他们在职业发展道路上往往有很多致命的问题。在30岁之前，他们的职业生涯表现很优秀，但从30岁到40岁这一段，很多人
[Velocity三]基于Servlet+Velocity的web应用 bit1129 velocity
什么是VelocityViewServlet 使用org.apache.velocity.tools.view.VelocityViewServlet可以将Velocity集成到基于Servlet的web应用中，以Servlet+Velocity的方式实现web应用 Servlet + Velocity的一般步骤 1.自定义Servlet，实现VelocityViewServl
【Kafka十二】关于Kafka是一个Commit Log Service bit1129 service
Kafka is a distributed, partitioned, replicated commit log service.这里的commit log如何理解？ A message is considered "committed" when all in sync replicas for that partition have applied i
NGINX + LUA实现复杂的控制 ronin47 lua nginx 控制
安装lua_nginx_module 模块 lua_nginx_module 可以一步步的安装，也可以直接用淘宝的OpenResty Centos和debian的安装就简单了。。这里说下freebsd的安装： fetch http://www.lua.org/ftp/lua-5.1.4.tar.gz tar zxvf lua-5.1.4.tar.gz cd lua-5.1.4 ma
java-14.输入一个已经按升序排序过的数组和一个数字，在数组中查找两个数，使得它们的和正好是输入的那个数字 bylijinnan java
public class TwoElementEqualSum { /** * 第 14 题：题目：输入一个已经按升序排序过的数组和一个数字，在数组中查找两个数，使得它们的和正好是输入的那个数字。要求时间复杂度是 O(n) 。如果有多对数字的和等于输入的数字，输出任意一对即可。例如输入数组 1 、 2 、 4 、 7 、 11 、 15 和数字 15 。由于
Netty源码学习-HttpChunkAggregator-HttpRequestEncoder-HttpResponseDecoder bylijinnan java netty
今天看Netty如何实现一个Http Server org.jboss.netty.example.http.file.HttpStaticFileServerPipelineFactory： pipeline.addLast("decoder", new HttpRequestDecoder()); pipeline.addLast(&quo
java敏感词过虑-基于多叉树原理 cngolon 违禁词过虑替换违禁词敏感词过虑多叉树
基于多叉树的敏感词、关键词过滤的工具包，用于java中的敏感词过滤 1、工具包自带敏感词词库，第一次调用时读入词库，故第一次调用时间可能较长，在类加载后普通pc机上html过滤5000字在80毫秒左右，纯文本35毫秒左右。 2、如需自定义词库，将jar包考入WEB-INF工程的lib目录，在WEB-INF/classes目录下建一个 utf-8的words.dict文本文件，
多线程知识 cuishikuan 多线程
T1，T2，T3三个线程工作顺序，按照T1，T2，T3依次进行 public class T1 implements Runnable{ @Override
spring整合activemq dalan_123 java spring jms
整合spring和activemq需要搞清楚如下的东东1、ConnectionFactory分： a、spring管理连接到activemq服务器的管理ConnectionFactory也即是所谓产生到jms服务器的链接 b、真正产生到JMS服务器链接的ConnectionFactory还得
MySQL时间字段究竟使用INT还是DateTime？ dcj3sjt126com mysql
环境：Windows XPPHP Version 5.2.9MySQL Server 5.1 第一步、创建一个表date_test（非定长、int时间） CREATE TABLE `test`.`date_test` (`id` INT NOT NULL AUTO_INCREMENT ,`start_time` INT NOT NULL ,`some_content`
Parcel: unable to marshal value dcj3sjt126com marshal
在两个activity直接传递List<xxInfo>时，出现Parcel: unable to marshal value异常。在MainActivity页面（MainActivity页面向NextActivity页面传递一个List<xxInfo>）： Intent intent = new Intent(this, Next
linux进程的查看上（ps） eksliang linux ps linux ps -l linux ps aux
ps:将某个时间点的进程运行情况选取下来转载请出自出处：http://eksliang.iteye.com/admin/blogs/2119469 http://eksliang.iteye.com ps 这个命令的man page 不是很好查阅，因为很多不同的Unix都使用这儿ps来查阅进程的状态，为了要符合不同版本的需求，所以这个
为什么第三方应用能早于System的app启动 gqdy365 System
Android应用的启动顺序网上有一大堆资料可以查阅了，这里就不细述了，这里不阐述ROM启动还有bootloader，软件启动的大致流程应该是启动kernel -> 运行servicemanager 把一些native的服务用命令启动起来（包括wifi, power, rild, surfaceflinger, mediaserver等等）-> 启动Dalivk中的第一个进程Zygot
App Framework发送JSONP请求(3) hw1287789687 jsonp 跨域请求发送jsonp ajax请求越狱请求
App Framework 中如何发送JSONP请求呢? 使用jsonp,详情请参考:http://json-p.org/ 如何发送Ajax请求呢? (1)登录 /*** * 会员登录 * @param username * @param password */ var user_login=function(username,password){ // aler
发福利，整理了一份关于“资源汇总”的汇总 justjavac 资源
觉得有用的话，可以去github关注：https://github.com/justjavac/awesome-awesomeness-zh_CN 通用 free-programming-books-zh_CN 免费的计算机编程类中文书籍精彩博客集合 hacke2/hacke2.github.io#2 ResumeSample 程序员简历
用 Java 技术创建 RESTful Web 服务 macroli java 编程 Web REST
转载：http://www.ibm.com/developerworks/cn/web/wa-jaxrs/ JAX-RS (JSR-311) 【 Java API for RESTful Web Services 】是一种 Java™ API，可使 Java Restful 服务的开发变得迅速而轻松。这个 API 提供了一种基于注释的模型来描述分布式资源。注释被用来提供资源的位
CentOS6.5-x86_64位下oracle11g的安装详细步骤及注意事项超声波 oracle linux
前言：这两天项目要上线了，由我负责往服务器部署整个项目，因此首先要往服务器安装oracle，服务器本身是CentOS6.5的64位系统，安装的数据库版本是11g，在整个的安装过程中碰到很多的坑，不过最后还是通过各种途径解决并成功装上了。转别写篇博客来记录完整的安装过程以及在整个过程中的注意事项。希望对以后那些刚刚接触的菜鸟们能起到一定的帮助作用。安装过程中可能遇到的问题（注
HttpClient 4.3 设置keeplive 和 timeout 的方法 supben httpclient
ConnectionKeepAliveStrategy kaStrategy = new DefaultConnectionKeepAliveStrategy() { @Override public long getKeepAliveDuration(HttpResponse response, HttpContext context) { long keepAlive
Spring 4.2新特性-@Import注解的升级 wiselyman spring 4
3.1 @Import @Import注解在4.2之前只支持导入配置类在4.2,@Import注解支持导入普通的java类,并将其声明成一个bean 3.2 示例演示java类 package com.wisely.spring4_2.imp; public class DemoService { public void doSomethin