输入数据格式
使用matplotlib包绘制散点图
# 导入所需的python包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# 设置绘图格式
plt.style.use('seaborn')
%matplotlib inline
# Build the example dataset: x runs from 1 to 100 and y is x plus
# normally distributed noise scaled by 15
df = pd.DataFrame({'x': range(1, 101), 'y': np.random.randn(100) * 15 + range(1, 101)})
# Show the first and last 5 rows of the example data.
# DataFrame.append was removed in pandas 2.0, so the two slices are joined
# with pd.concat, which keeps the original row labels.
pd.concat([df.head(5), df.tail(5)])
|    | x   | y          |
|----|-----|------------|
| 0  | 1   | 7.821203   |
| 1  | 2   | 8.372683   |
| 2  | 3   | 10.616092  |
| 3  | 4   | -0.183374  |
| 4  | 5   | 18.387730  |
| 95 | 96  | 101.110453 |
| 96 | 97  | 102.630476 |
| 97 | 98  | 90.080476  |
| 98 | 99  | 121.161754 |
| 99 | 100 | 78.376947  |
# Basic scatter plot: show markers only by switching the connecting line off
plt.plot(
    'x', 'y',
    data=df,
    marker='o',
    linestyle='none',
)
plt.show()
设置点的形状
# The marker argument selects the shape of each point
# === first figure: draw the data with star-shaped markers
plt.plot('x', 'y', data=df, linestyle='none', marker='*')
plt.show()

# === second figure: a 4x4 grid showing one marker type per cell
# Marker codes to display. The last two entries are empty strings, which
# matplotlib documents as "nothing", so those two grid cells show no marker.
# NOTE(review): confirm the two empty entries are intended.
all_poss = ['.', 'o', 'v', '^', '>', '<', 's', 'p', '*', 'h', 'H', 'D', 'd', '1', '', '']
# To see all possibilities:
# markers.MarkerStyle.markers.keys()

# Set the range of the x and y axes so the 4x4 grid fits with a margin
plt.xlim(0.5, 4.5)
plt.ylim(0.5, 4.5)
# Remove the ticks and tick values from both axes
plt.xticks([])
plt.yticks([])

# Add the markers one by one. The loop bodies are indented explicitly:
# without indentation the nested loops are a syntax error.
num = 0
for x in range(1, 5):
    for y in range(1, 5):
        marker = all_poss[num]
        num += 1
        plt.plot(x, y, marker=marker, markerfacecolor='orange', markersize=23, markeredgecolor="black")
        # Text annotation: print the marker code just to the right of the marker
        plt.text(x + 0.2, y, marker, horizontalalignment='left', size='medium', color='black', weight='semibold')
设置点的大小
# The markersize argument controls how large each point is drawn
plt.plot(
    'x', 'y',
    data=df,
    marker='D',
    markersize=16,
    linestyle='none',
)
plt.show()
设置点的颜色
# markerfacecolor sets the fill colour of each point, markeredgecolor the
# colour of its outline, and markeredgewidth the width of that outline
plt.plot(
    'x', 'y', data=df,
    linestyle='none', marker="o", markersize=16,
    markerfacecolor='skyblue',
    markeredgecolor="black",
)
plt.show()

plt.plot(
    'x', 'y', data=df,
    linestyle='none', marker='D', markersize=16,
    markeredgecolor="orange",
    markeredgewidth=5,
)
plt.show()

# Add a connecting line; the linestyle argument selects the line type
plt.plot('x', 'y', data=df, marker='o', linestyle='-')
plt.show()
添加注释信息
# Basic chart: regenerate the noisy example data and draw it as points
df = pd.DataFrame({
    'x': range(1, 101),
    'y': np.random.randn(100) * 15 + range(1, 101),
})
plt.plot('x', 'y', data=df, marker='o', linestyle='none')

# Annotate with a text label plus an arrow
plt.annotate(
    'This point is interesting!',  # label text
    xy=(25, 50),                   # coordinate the annotation refers to
    xytext=(0, 80),                # coordinate where the label text is placed
    arrowprops={'facecolor': 'black', 'shrink': 0.05},  # custom arrow
)
# Plot: regenerate the noisy example data and draw it as points
df = pd.DataFrame({'x': range(1, 101), 'y': np.random.randn(100) * 15 + range(1, 101)})
plt.plot('x', 'y', data=df, linestyle='none', marker='o')
# Annotation: place a math-text equation at (40, 0).
# The raw string must end with the closing '$' and quote; a garbled ending
# (``x_i/pre>``) left the string literal unterminated.
plt.text(40, 0, r'equation: $\sum_{i=0}^\infty x_i$', fontsize=20)
# Plot: regenerate the noisy example data and draw it as points
df = pd.DataFrame({
    'x': range(1, 101),
    'y': np.random.randn(100) * 15 + range(1, 101),
})
plt.plot('x', 'y', data=df, marker='o', linestyle='none')

# Annotation: reference lines
plt.axvline(x=40, color='r')      # vertical line at x = 40
plt.axhline(y=40, color='green')  # horizontal line at y = 40
# Libraries: patch shapes that can be drawn on an Axes
import matplotlib.patches as patches

# Plot on an explicit figure/axes pair so the patch can be attached to the axes
fig1, ax1 = plt.subplots()
ax1.plot('x', 'y', data=df, marker='o', linestyle='none')

# Add a rectangular highlight region
# (a rotation can be added as well with 'angle')
highlight_box = patches.Rectangle(
    xy=(20, 25),   # (x, y) anchor of the rectangle
    width=50,
    height=50,
    alpha=0.3,
    facecolor="red",
    edgecolor="black",
    linewidth=3,
    linestyle='solid',
)
ax1.add_patch(highlight_box)
# Plot on an explicit figure/axes pair so the patch can be attached to the axes
fig1, ax1 = plt.subplots()
ax1.plot('x', 'y', data=df, marker='o', linestyle='none')

# Annotation: add a circular highlight region
highlight_circle = patches.Circle(
    xy=(40, 35),   # (x, y) centre of the circle
    radius=30,
    alpha=0.3,
    facecolor="green",
    edgecolor="black",
    linewidth=1,
    linestyle='solid',
)
ax1.add_patch(highlight_circle)
避免点的重叠
# Dataset: build three normally distributed point clouds of 20000 rows each,
# labelled as groups 'A', 'B' and 'C'
df = pd.DataFrame({'x': np.random.normal(10, 1.2, 20000), 'y': np.random.normal(10, 1.2, 20000), 'group': np.repeat('A', 20000)})
tmp1 = pd.DataFrame({'x': np.random.normal(14.5, 1.2, 20000), 'y': np.random.normal(14.5, 1.2, 20000), 'group': np.repeat('B', 20000)})
tmp2 = pd.DataFrame({'x': np.random.normal(9.5, 1.5, 20000), 'y': np.random.normal(15.5, 1.5, 20000), 'group': np.repeat('C', 20000)})
# Stack the three groups into one frame. DataFrame.append was removed in
# pandas 2.0; pd.concat gives the same result and, like the chained append,
# keeps each group's original 0..19999 row labels.
df = pd.concat([df, tmp1, tmp2])
df.head(10)
|   | x         | y         | group |
|---|-----------|-----------|-------|
| 0 | 11.529794 | 11.000711 | A     |
| 1 | 10.524043 | 11.541500 | A     |
| 2 | 9.845806  | 9.156706  | A     |
| 3 | 10.970836 | 9.428074  | A     |
| 4 | 10.748096 | 12.098970 | A     |
| 5 | 9.455139  | 8.636227  | A     |
| 6 | 8.094581  | 8.518158  | A     |
| 7 | 10.259945 | 9.168257  | A     |
| 8 | 9.420490  | 10.227326 | A     |
| 9 | 7.124481  | 9.170850  | A     |
# Plot every row of the dataset as a default-sized marker with no connecting line
plt.plot(
    'x', 'y',
    data=df,
    marker='o',
    linestyle='',
)
# Axis labels
plt.xlabel('Value of X')
plt.ylabel('Value of Y')
# Left-aligned title
plt.title('Overplotting looks like that:', loc='left')
# Change the point size: plot with a small marker size
plt.plot(
    'x', 'y',
    data=df,
    marker='o',
    markersize=0.7,
    linestyle='',
)
# Axis labels and left-aligned title
plt.xlabel('Value of X')
plt.ylabel('Value of Y')
plt.title('Overplotting? Try to reduce the dot size', loc='left')
# Set the point transparency: plot with a low alpha value
plt.plot(
    'x', 'y',
    data=df,
    marker='o',
    markersize=3,
    alpha=0.05,
    color="red",
    linestyle='',
)
# Titles
plt.xlabel('Value of X')
plt.ylabel('Value of Y')
plt.title('Overplotting? Try to use transparency', loc='left')
# Random sampling: draw 1000 random rows from the dataset
df_sample = df.sample(n=1000)
# Make the plot with this subset only
plt.plot(
    'x', 'y',
    data=df_sample,
    marker='o',
    linestyle='',
)
# Titles
plt.xlabel('Value of X')
plt.ylabel('Value of Y')
plt.title('Overplotting? Sample your data', loc='left')
使用seaborn包绘制散点图
# Library & dataset
import seaborn as sns
# Load the 'iris' example dataset through seaborn
df = sns.load_dataset('iris')
# Show the first and last 5 rows of the data.
# DataFrame.append was removed in pandas 2.0, so the two slices are joined
# with pd.concat, which keeps the original row labels.
pd.concat([df.head(5), df.tail(5)])
|     | sepal_length | sepal_width | petal_length | petal_width | species   |
|-----|--------------|-------------|--------------|-------------|-----------|
| 0   | 5.1          | 3.5         | 1.4          | 0.2         | setosa    |
| 1   | 4.9          | 3.0         | 1.4          | 0.2         | setosa    |
| 2   | 4.7          | 3.2         | 1.3          | 0.2         | setosa    |
| 3   | 4.6          | 3.1         | 1.5          | 0.2         | setosa    |
| 4   | 5.0          | 3.6         | 1.4          | 0.2         | setosa    |
| 145 | 6.7          | 3.0         | 5.2          | 2.3         | virginica |
| 146 | 6.3          | 2.5         | 5.0          | 1.9         | virginica |
| 147 | 6.5          | 3.0         | 5.2          | 2.0         | virginica |
| 148 | 6.2          | 3.4         | 5.4          | 2.3         | virginica |
| 149 | 5.9          | 3.0         | 5.1          | 1.8         | virginica |
使用regplot函数绘制散点图
# Show the built-in documentation for regplot
help(sns.regplot)
regplot(x, y, data=None, x_estimator=None, x_bins=None, x_ci='ci', scatter=True, fit_reg=True, ci=95, n_boot=1000, units=None, order=1, logistic=False, lowess=False, robust=False, logx=False, x_partial=None, y_partial=None, truncate=False, dropna=True, x_jitter=None, y_jitter=None, label=None, color=None, marker='o', scatter_kws=None, line_kws=None, ax=None)
Plot data and a linear regression model fit.
There are a number of mutually exclusive options for estimating the
regression model. See the :ref:`tutorial <regression_tutorial>` for more
information.
Parameters
----------
x, y: string, series, or vector array
Input variables. If strings, these should correspond with column names
in ``data``. When pandas objects are used, axes will be labeled with
the series name.
data : DataFrame
Tidy ("long-form") dataframe where each column is a variable and each
row is an observation.
x_estimator : callable that maps vector -> scalar, optional
Apply this function to each unique value of ``x`` and plot the
resulting estimate. This is useful when ``x`` is a discrete variable.
If ``x_ci`` is given, this estimate will be bootstrapped and a
confidence interval will be drawn.
x_bins : int or vector, optional
Bin the ``x`` variable into discrete bins and then estimate the central
tendency and a confidence interval. This binning only influences how
the scatterplot is drawn; the regression is still fit to the original
data. This parameter is interpreted either as the number of
evenly-sized (not necessary spaced) bins or the positions of the bin
centers. When this parameter is used, it implies that the default of
``x_estimator`` is ``numpy.mean``.
x_ci : "ci", "sd", int in [0, 100] or None, optional
Size of the confidence interval used when plotting a central tendency
for discrete values of ``x``. If ``"ci"``, defer to the value of the
``ci`` parameter. If ``"sd"``, skip bootstrapping and show the
standard deviation of the observations in each bin.
scatter : bool, optional
If ``True``, draw a scatterplot with the underlying observations (or
the ``x_estimator`` values).
fit_reg : bool, optional
If ``True``, estimate and plot a regression model relating the ``x``
and ``y`` variables.
ci : int in [0, 100] or None, optional
Size of the confidence interval for the regression estimate. This will
be drawn using translucent bands around the regression line. The
confidence interval is estimated using a bootstrap; for large
datasets, it may be advisable to avoid that computation by setting
this parameter to None.
n_boot : int, optional
Number of bootstrap resamples used to estimate the ``ci``. The default
value attempts to balance time and stability; you may want to increase
this value for "final" versions of plots.
units : variable name in ``data``, optional
If the ``x`` and ``y`` observations are nested within sampling units,
those can be specified here. This will be taken into account when
computing the confidence intervals by performing a multilevel bootstrap
that resamples both units and observations (within unit). This does not
otherwise influence how the regression is estimated or drawn.
order : int, optional
If ``order`` is greater than 1, use ``numpy.polyfit`` to estimate a
polynomial regression.
logistic : bool, optional
If ``True``, assume that ``y`` is a binary variable and use
``statsmodels`` to estimate a logistic regression model. Note that this
is substantially more computationally intensive than linear regression,
so you may wish to decrease the number of bootstrap resamples
(``n_boot``) or set ``ci`` to None.
lowess : bool, optional
If ``True``, use ``statsmodels`` to estimate a nonparametric lowess
model (locally weighted linear regression). Note that confidence
intervals cannot currently be drawn for this kind of model.
robust : bool, optional
If ``True``, use ``statsmodels`` to estimate a robust regression. This
will de-weight outliers. Note that this is substantially more
computationally intensive than standard linear regression, so you may
wish to decrease the number of bootstrap resamples (``n_boot``) or set
``ci`` to None.
logx : bool, optional
If ``True``, estimate a linear regression of the form y ~ log(x), but
plot the scatterplot and regression model in the input space. Note that
``x`` must be positive for this to work.
{x,y}_partial : strings in ``data`` or matrices
Confounding variables to regress out of the ``x`` or ``y`` variables
before plotting.
truncate : bool, optional
By default, the regression line is drawn to fill the x axis limits
after the scatterplot is drawn. If ``truncate`` is ``True``, it will
instead by bounded by the data limits.
{x,y}_jitter : floats, optional
Add uniform random noise of this size to either the ``x`` or ``y``
variables. The noise is added to a copy of the data after fitting the
regression, and only influences the look of the scatterplot. This can
be helpful when plotting variables that take discrete values.
label : string
Label to apply to ether the scatterplot or regression line (if
``scatter`` is ``False``) for use in a legend.
color : matplotlib color
Color to apply to all plot elements; will be superseded by colors
passed in ``scatter_kws`` or ``line_kws``.
marker : matplotlib marker code
Marker to use for the scatterplot glyphs.
{scatter,line}_kws : dictionaries
Additional keyword arguments to pass to ``plt.scatter`` and
``plt.plot``.
ax : matplotlib Axes, optional
Axes object to draw the plot onto, otherwise uses the current Axes.
Returns
-------
ax : matplotlib Axes
The Axes object containing the plot.
See Also
--------
lmplot : Combine :func:`regplot` and :class:`FacetGrid` to plot multiple
linear relationships in a dataset.
jointplot : Combine :func:`regplot` and :class:`JointGrid` (when used with
``kind="reg"``).
pairplot : Combine :func:`regplot` and :class:`PairGrid` (when used with
``kind="reg"``).
residplot : Plot the residuals of a linear regression model.
Notes
-----
The :func:`regplot` and :func:`lmplot` functions are closely related, but
the former is an axes-level function while the latter is a figure-level
function that combines :func:`regplot` and :class:`FacetGrid`.
It's also easy to combine combine :func:`regplot` and :class:`JointGrid` or
:class:`PairGrid` through the :func:`jointplot` and :func:`pairplot`
functions, although these do not directly accept all of :func:`regplot`'s
parameters.
Examples
--------
Plot the relationship between two variables in a DataFrame:
.. plot::
:context: close-figs
>>> import seaborn as sns; sns.set(color_codes=True)
>>> tips = sns.load_dataset("tips")
>>> ax = sns.regplot(x="total_bill", y="tip", data=tips)
Plot with two variables defined as numpy arrays; use a different color:
.. plot::
:context: close-figs
>>> import numpy as np; np.random.seed(8)
>>> mean, cov = [4, 6], [(1.5, .7), (.7, 1)]
>>> x, y = np.random.multivariate_normal(mean, cov, 80).T
>>> ax = sns.regplot(x=x, y=y, color="g")
Plot with two variables defined as pandas Series; use a different marker:
.. plot::
:context: close-figs
>>> import pandas as pd
>>> x, y = pd.Series(x, name="x_var"), pd.Series(y, name="y_var")
>>> ax = sns.regplot(x=x, y=y, marker="+")
Use a 68% confidence interval, which corresponds with the standard error
of the estimate:
.. plot::
:context: close-figs
>>> ax = sns.regplot(x=x, y=y, ci=68)
Plot with a discrete ``x`` variable and add some jitter:
.. plot::
:context: close-figs
>>> ax = sns.regplot(x="size", y="total_bill", data=tips, x_jitter=.1)
Plot with a discrete ``x`` variable showing means and confidence intervals
for unique values:
.. plot::
:context: close-figs
>>> ax = sns.regplot(x="size", y="total_bill", data=tips,
... x_estimator=np.mean)
Plot with a continuous variable divided into discrete bins:
.. plot::
:context: close-figs
>>> ax = sns.regplot(x=x, y=y, x_bins=4)
Fit a higher-order polynomial regression and truncate the model prediction:
.. plot::
:context: close-figs
>>> ans = sns.load_dataset("anscombe")
>>> ax = sns.regplot(x="x", y="y", data=ans.loc[ans.dataset == "II"],
... scatter_kws={"s": 80},
... order=2, ci=None, truncate=True)
Fit a robust regression and don't plot a confidence interval:
.. plot::
:context: close-figs
>>> ax = sns.regplot(x="x", y="y", data=ans.loc[ans.dataset == "III"],
... scatter_kws={"s": 80},
... robust=True, ci=None)
Fit a logistic regression; jitter the y variable and use fewer bootstrap
iterations:
.. plot::
:context: close-figs
>>> tips["big_tip"] = (tips.tip / tips.total_bill) > .175
>>> ax = sns.regplot(x="total_bill", y="big_tip", data=tips,
... logistic=True, n_boot=500, y_jitter=.03)
Fit the regression model using log(x) and truncate the model prediction:
.. plot::
:context: close-figs
>>> ax = sns.regplot(x="size", y="total_bill", data=tips,
... x_estimator=np.mean, logx=True, truncate=True)
# Draw scatter plots with regplot, using the two sepal columns of the data
sepal_length = df["sepal_length"]
sepal_width = df["sepal_width"]

# Default call: scatter plot together with a fitted regression line
sns.regplot(x=sepal_length, y=sepal_width)

# Without regression fit: fit_reg=False drops the regression line
sns.regplot(x=sepal_length, y=sepal_width, fit_reg=False)

# Change the shape of the marker with the marker argument
sns.regplot(x=sepal_length, y=sepal_width, fit_reg=False, marker="+")

# More marker customization: scatter_kws sets the colour, transparency and
# size of the points
point_style = {"color": "darkred", "alpha": 0.3, "s": 200}
sns.regplot(x=sepal_length, y=sepal_width, fit_reg=False, scatter_kws=point_style)
使用lmplot函数绘制散点图
# Use the 'hue' argument to colour the points by a factor variable
sns.lmplot(
    x="sepal_length", y="sepal_width",
    data=df,
    hue='species',
    fit_reg=False,
    legend=False,
)
# Move the legend to an empty part of the plot
plt.legend(loc='lower right')

# Give a list to the markers argument to set the point shapes
sns.lmplot(
    x="sepal_length", y="sepal_width",
    data=df,
    hue='species',
    markers=["o", "x", "1"],
    fit_reg=False,
    legend=False,
)
# Move the legend to an empty part of the plot
plt.legend(loc='lower right')

# Use the 'palette' argument to choose the colour palette; here legend=True
# is passed to lmplot and plt.legend is not called
sns.lmplot(
    x="sepal_length", y="sepal_width",
    data=df,
    hue='species',
    palette="Set2",
    fit_reg=False,
    legend=True,
)
使用jointplot函数绘制边际图
# Marginal plots with jointplot, using the two sepal columns of the data
sepal_length = df["sepal_length"]
sepal_width = df["sepal_width"]

# Customize the inside plot with the kind argument;
# options are: "scatter" | "reg" | "resid" | "kde" | "hex"
sns.jointplot(x=sepal_length, y=sepal_width, kind='scatter')
sns.jointplot(x=sepal_length, y=sepal_width, kind='hex')
sns.jointplot(x=sepal_length, y=sepal_width, kind='kde')

# Then you can pass arguments to each type:
# set the size and colour of the points and the colour and width of their edges
sns.jointplot(
    x=sepal_length, y=sepal_width,
    kind='scatter',
    s=200,
    color='m',
    edgecolor="skyblue",
    linewidth=2,
)

# Customize the colour
sns.set(style="white", color_codes=True)
sns.jointplot(x=sepal_length, y=sepal_width, kind='kde', color="skyblue")
使用pairplot函数绘制配对散点图
# First: pairwise scatter plots coloured by species, with one marker shape
# per species and the "Set2" palette
sns.pairplot(
    df,
    kind="scatter",
    hue="species",
    markers=["o", "s", "D"],
    palette="Set2",
)
plt.show()

# Second: you can give other arguments with plot_kws
sns.pairplot(
    df,
    kind="scatter",
    hue="species",
    plot_kws={"s": 80, "edgecolor": "white", "linewidth": 2.5},
)
plt.show()
参考来源:https://python-graph-gallery.com/scatter-plot/