1.1载入数据
任务一:导入numpy和pandas
import numpy as np
import pandas as pd
import os
任务二:载入数据
(1) 使用相对路径载入
# Remember the starting directory, then switch into the dataset folder so the
# CSV can be opened with a relative path.
cwd = os.getcwd()
# Raw string: "\d" and "\T" are not valid escape sequences, and recent Python
# versions warn about them in ordinary string literals. The path is unchanged.
os.chdir(r"D:\datasets\Titanic")
df = pd.read_csv('train.csv')
df.head()
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
(2) 使用绝对路径载入数据
# Load the same CSV through its absolute path (backslashes are escaped).
df = pd.read_csv('D:\\datasets\\Titanic\\train.csv')
df.head()
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
任务三:每1000行为一个数据模块,逐块读取
# Request chunked reading with chunksize=1000 instead of loading the whole
# file at once; the resulting reader object is echoed as the cell result.
chunker = pd.read_csv('train.csv', chunksize=1000)
chunker
任务四:将表头改成中文,索引改为乘客ID
# Re-read the file, replacing the original header row (header=0) with Chinese
# column names and using the passenger-ID column as the row index.
df = pd.read_csv(
    'train.csv',
    header=0,
    names=[
        '乘客ID', '是否幸存', '仓位等级', '姓名', '性别', '年龄',
        '兄弟姐妹个数', '父母子女个数', '船票信息', '票价', '客舱', '登船港口',
    ],
    index_col='乘客ID',
)
df.head()
|
是否幸存 |
仓位等级 |
姓名 |
性别 |
年龄 |
兄弟姐妹个数 |
父母子女个数 |
船票信息 |
票价 |
客舱 |
登船港口 |
乘客ID |
|
|
|
|
|
|
|
|
|
|
|
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
1.2初步观察
任务一:查看数据的基本信息
# `info` is a method: without the parentheses the line only references the
# bound method and no summary is produced.
df.info()
# Display the first 10 rows.
df.head(10)
|
是否幸存 |
仓位等级 |
姓名 |
性别 |
年龄 |
兄弟姐妹个数 |
父母子女个数 |
船票信息 |
票价 |
客舱 |
登船港口 |
乘客ID |
|
|
|
|
|
|
|
|
|
|
|
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
6 |
0 |
3 |
Moran, Mr. James |
male |
NaN |
0 |
0 |
330877 |
8.4583 |
NaN |
Q |
7 |
0 |
1 |
McCarthy, Mr. Timothy J |
male |
54.0 |
0 |
0 |
17463 |
51.8625 |
E46 |
S |
8 |
0 |
3 |
Palsson, Master. Gosta Leonard |
male |
2.0 |
3 |
1 |
349909 |
21.0750 |
NaN |
S |
9 |
1 |
3 |
Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) |
female |
27.0 |
0 |
2 |
347742 |
11.1333 |
NaN |
S |
10 |
1 |
2 |
Nasser, Mrs. Nicholas (Adele Achem) |
female |
14.0 |
1 |
0 |
237736 |
30.0708 |
NaN |
C |
# Display the last 15 rows.
df.tail(15)
|
是否幸存 |
仓位等级 |
姓名 |
性别 |
年龄 |
兄弟姐妹个数 |
父母子女个数 |
船票信息 |
票价 |
客舱 |
登船港口 |
乘客ID |
|
|
|
|
|
|
|
|
|
|
|
877 |
0 |
3 |
Gustafsson, Mr. Alfred Ossian |
male |
20.0 |
0 |
0 |
7534 |
9.8458 |
NaN |
S |
878 |
0 |
3 |
Petroff, Mr. Nedelio |
male |
19.0 |
0 |
0 |
349212 |
7.8958 |
NaN |
S |
879 |
0 |
3 |
Laleff, Mr. Kristo |
male |
NaN |
0 |
0 |
349217 |
7.8958 |
NaN |
S |
880 |
1 |
1 |
Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) |
female |
56.0 |
0 |
1 |
11767 |
83.1583 |
C50 |
C |
881 |
1 |
2 |
Shelley, Mrs. William (Imanita Parrish Hall) |
female |
25.0 |
0 |
1 |
230433 |
26.0000 |
NaN |
S |
882 |
0 |
3 |
Markun, Mr. Johann |
male |
33.0 |
0 |
0 |
349257 |
7.8958 |
NaN |
S |
883 |
0 |
3 |
Dahlberg, Miss. Gerda Ulrika |
female |
22.0 |
0 |
0 |
7552 |
10.5167 |
NaN |
S |
884 |
0 |
2 |
Banfield, Mr. Frederick James |
male |
28.0 |
0 |
0 |
C.A./SOTON 34068 |
10.5000 |
NaN |
S |
885 |
0 |
3 |
Sutehall, Mr. Henry Jr |
male |
25.0 |
0 |
0 |
SOTON/OQ 392076 |
7.0500 |
NaN |
S |
886 |
0 |
3 |
Rice, Mrs. William (Margaret Norton) |
female |
39.0 |
0 |
5 |
382652 |
29.1250 |
NaN |
Q |
887 |
0 |
2 |
Montvila, Rev. Juozas |
male |
27.0 |
0 |
0 |
211536 |
13.0000 |
NaN |
S |
888 |
1 |
1 |
Graham, Miss. Margaret Edith |
female |
19.0 |
0 |
0 |
112053 |
30.0000 |
B42 |
S |
889 |
0 |
3 |
Johnston, Miss. Catherine Helen "Carrie" |
female |
NaN |
1 |
2 |
W./C. 6607 |
23.4500 |
NaN |
S |
890 |
1 |
1 |
Behr, Mr. Karl Howell |
male |
26.0 |
0 |
0 |
111369 |
30.0000 |
C148 |
C |
891 |
0 |
3 |
Dooley, Mr. Patrick |
male |
32.0 |
0 |
0 |
370376 |
7.7500 |
NaN |
Q |
任务三:判断数据是否为空,为空的地方返回True,其余地方返回False
# Element-wise missing-value mask: True where a cell is empty, False otherwise.
df.isnull()
|
是否幸存 |
仓位等级 |
姓名 |
性别 |
年龄 |
兄弟姐妹个数 |
父母子女个数 |
船票信息 |
票价 |
客舱 |
登船港口 |
乘客ID |
|
|
|
|
|
|
|
|
|
|
|
1 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
True |
False |
2 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
3 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
True |
False |
4 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
5 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
True |
False |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
887 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
True |
False |
888 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
889 |
False |
False |
False |
False |
True |
False |
False |
False |
False |
True |
False |
890 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
891 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
True |
False |
891 rows × 11 columns
1.3 保存数据
任务一:将你加载并做出改变的数据,在工作目录下保存为一个新文件train_chinese.csv
# Write the renamed frame to a CSV file in the working directory.
# NOTE(review): the task text asks for 'train_chinese.csv', but the name used
# here is 'train.chinese.csv'. Later cells read this same dotted name, so any
# rename has to be applied to the writer and all readers together.
df.to_csv('train.chinese.csv')
2.1知道你的数据叫什么
任务一:pandas中有两个数据类型DataFrame和Series,通过查找资料简单了解它们,然后自己写一个关于这两个数据类型的小例子
# Series example: the dict keys become the index labels, the values the data.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
example_1 = pd.Series(sdata)
example_1
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
# DataFrame example: each dict key becomes a column, each list the column values.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
example_2 = pd.DataFrame(data)
example_2
|
state |
year |
pop |
0 |
Ohio |
2000 |
1.5 |
1 |
Ohio |
2001 |
1.7 |
2 |
Ohio |
2002 |
3.6 |
3 |
Nevada |
2001 |
2.4 |
4 |
Nevada |
2002 |
2.9 |
5 |
Nevada |
2003 |
3.2 |
任务二:根据上节课的方法载入"train.csv"文件
# Reload the CSV written in section 1.3. No index_col is given, so the
# passenger ID comes back as an ordinary column (see the output below).
df=pd.read_csv('train.chinese.csv')
df.head()
|
乘客ID |
是否幸存 |
仓位等级 |
姓名 |
性别 |
年龄 |
兄弟姐妹个数 |
父母子女个数 |
船票信息 |
票价 |
客舱 |
登船港口 |
0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
任务三:查看DataFrame数据的每列的项
# List the column labels of the frame.
df.columns
Index(['乘客ID', '是否幸存', '仓位等级', '姓名', '性别', '年龄', '兄弟姐妹个数', '父母子女个数', '船票信息',
'票价', '客舱', '登船港口'],
dtype='object')
任务四:查看"cabin"这列的所有项
# NOTE(review): dir() lists the attribute and method names of the Series
# object (as the output below shows), not the values stored in the column;
# the values themselves are previewed by the next cell.
dir(df['客舱'])
['T',
'_AXIS_ALIASES',
'_AXIS_IALIASES',
'_AXIS_LEN',
'_AXIS_NAMES',
'_AXIS_NUMBERS',
'_AXIS_ORDERS',
'_AXIS_REVERSED',
'_HANDLED_TYPES',
'__abs__',
'__add__',
'__and__',
'__annotations__',
'__array__',
'__array_priority__',
'__array_ufunc__',
'__array_wrap__',
'__bool__',
'__class__',
'__contains__',
'__copy__',
'__deepcopy__',
'__delattr__',
'__delitem__',
'__dict__',
'__dir__',
'__div__',
'__divmod__',
'__doc__',
'__eq__',
'__finalize__',
'__float__',
'__floordiv__',
'__format__',
'__ge__',
'__getattr__',
'__getattribute__',
'__getitem__',
'__getstate__',
'__gt__',
'__hash__',
'__iadd__',
'__iand__',
'__ifloordiv__',
'__imod__',
'__imul__',
'__init__',
'__init_subclass__',
'__int__',
'__invert__',
'__ior__',
'__ipow__',
'__isub__',
'__iter__',
'__itruediv__',
'__ixor__',
'__le__',
'__len__',
'__long__',
'__lt__',
'__matmul__',
'__mod__',
'__module__',
'__mul__',
'__ne__',
'__neg__',
'__new__',
'__nonzero__',
'__or__',
'__pos__',
'__pow__',
'__radd__',
'__rand__',
'__rdiv__',
'__rdivmod__',
'__reduce__',
'__reduce_ex__',
'__repr__',
'__rfloordiv__',
'__rmatmul__',
'__rmod__',
'__rmul__',
'__ror__',
'__round__',
'__rpow__',
'__rsub__',
'__rtruediv__',
'__rxor__',
'__setattr__',
'__setitem__',
'__setstate__',
'__sizeof__',
'__str__',
'__sub__',
'__subclasshook__',
'__truediv__',
'__weakref__',
'__xor__',
'_accessors',
'_add_numeric_operations',
'_add_series_or_dataframe_operations',
'_agg_by_level',
'_agg_examples_doc',
'_agg_see_also_doc',
'_aggregate',
'_aggregate_multiple_funcs',
'_align_frame',
'_align_series',
'_binop',
'_box_item_values',
'_builtin_table',
'_can_hold_na',
'_check_inplace_setting',
'_check_is_chained_assignment_possible',
'_check_label_or_level_ambiguity',
'_check_setitem_copy',
'_clear_item_cache',
'_clip_with_one_bound',
'_clip_with_scalar',
'_consolidate',
'_consolidate_inplace',
'_construct_axes_dict',
'_construct_axes_dict_from',
'_construct_axes_from_arguments',
'_constructor',
'_constructor_expanddim',
'_constructor_sliced',
'_convert',
'_convert_dtypes',
'_create_indexer',
'_cython_table',
'_deprecations',
'_dir_additions',
'_dir_deletions',
'_drop_axis',
'_drop_labels_or_levels',
'_find_valid_index',
'_from_axes',
'_get_axis',
'_get_axis_name',
'_get_axis_number',
'_get_axis_resolvers',
'_get_block_manager_axis',
'_get_bool_data',
'_get_cacher',
'_get_cleaned_column_resolvers',
'_get_cython_func',
'_get_index_resolvers',
'_get_item_cache',
'_get_label_or_level_values',
'_get_numeric_data',
'_get_value',
'_get_values',
'_get_values_tuple',
'_get_with',
'_gotitem',
'_iget_item_cache',
'_index',
'_indexed_same',
'_info_axis',
'_info_axis_name',
'_info_axis_number',
'_init_dict',
'_init_mgr',
'_internal_get_values',
'_internal_names',
'_internal_names_set',
'_is_builtin_func',
'_is_cached',
'_is_copy',
'_is_datelike_mixed_type',
'_is_label_or_level_reference',
'_is_label_reference',
'_is_level_reference',
'_is_mixed_type',
'_is_numeric_mixed_type',
'_is_view',
'_ix',
'_ixs',
'_map_values',
'_maybe_cache_changed',
'_maybe_update_cacher',
'_metadata',
'_ndarray_values',
'_needs_reindex_multi',
'_obj_with_exclusions',
'_protect_consolidate',
'_reduce',
'_reindex_axes',
'_reindex_indexer',
'_reindex_multi',
'_reindex_with_indexers',
'_repr_data_resource_',
'_repr_latex_',
'_reset_cache',
'_reset_cacher',
'_selected_obj',
'_selection',
'_selection_list',
'_selection_name',
'_set_as_cached',
'_set_axis',
'_set_axis_name',
'_set_is_copy',
'_set_item',
'_set_labels',
'_set_name',
'_set_subtyp',
'_set_value',
'_set_values',
'_set_with',
'_set_with_engine',
'_setup_axes',
'_slice',
'_stat_axis',
'_stat_axis_name',
'_stat_axis_number',
'_take_with_is_copy',
'_to_dict_of_blocks',
'_try_aggregate_string_function',
'_typ',
'_unpickle_series_compat',
'_update_inplace',
'_validate_dtype',
'_values',
'_where',
'_xs',
'abs',
'add',
'add_prefix',
'add_suffix',
'agg',
'aggregate',
'align',
'all',
'any',
'append',
'apply',
'argmax',
'argmin',
'argsort',
'array',
'asfreq',
'asof',
'astype',
'at',
'at_time',
'attrs',
'autocorr',
'axes',
'between',
'between_time',
'bfill',
'bool',
'clip',
'combine',
'combine_first',
'convert_dtypes',
'copy',
'corr',
'count',
'cov',
'cummax',
'cummin',
'cumprod',
'cumsum',
'describe',
'diff',
'div',
'divide',
'divmod',
'dot',
'drop',
'drop_duplicates',
'droplevel',
'dropna',
'dtype',
'dtypes',
'duplicated',
'empty',
'eq',
'equals',
'ewm',
'expanding',
'explode',
'factorize',
'ffill',
'fillna',
'filter',
'first',
'first_valid_index',
'floordiv',
'ge',
'get',
'groupby',
'gt',
'hasnans',
'head',
'hist',
'iat',
'idxmax',
'idxmin',
'iloc',
'index',
'infer_objects',
'interpolate',
'is_monotonic',
'is_monotonic_decreasing',
'is_monotonic_increasing',
'is_unique',
'isin',
'isna',
'isnull',
'item',
'items',
'iteritems',
'keys',
'kurt',
'kurtosis',
'last',
'last_valid_index',
'le',
'loc',
'lt',
'mad',
'map',
'mask',
'max',
'mean',
'median',
'memory_usage',
'min',
'mod',
'mode',
'mul',
'multiply',
'name',
'nbytes',
'ndim',
'ne',
'nlargest',
'notna',
'notnull',
'nsmallest',
'nunique',
'pct_change',
'pipe',
'plot',
'pop',
'pow',
'prod',
'product',
'quantile',
'radd',
'rank',
'ravel',
'rdiv',
'rdivmod',
'reindex',
'reindex_like',
'rename',
'rename_axis',
'reorder_levels',
'repeat',
'replace',
'resample',
'reset_index',
'rfloordiv',
'rmod',
'rmul',
'rolling',
'round',
'rpow',
'rsub',
'rtruediv',
'sample',
'searchsorted',
'sem',
'set_axis',
'shape',
'shift',
'size',
'skew',
'slice_shift',
'sort_index',
'sort_values',
'squeeze',
'std',
'str',
'sub',
'subtract',
'sum',
'swapaxes',
'swaplevel',
'tail',
'take',
'to_clipboard',
'to_csv',
'to_dict',
'to_excel',
'to_frame',
'to_hdf',
'to_json',
'to_latex',
'to_list',
'to_markdown',
'to_numpy',
'to_period',
'to_pickle',
'to_sql',
'to_string',
'to_timestamp',
'to_xarray',
'transform',
'transpose',
'truediv',
'truncate',
'tshift',
'tz_convert',
'tz_localize',
'unique',
'unstack',
'update',
'value_counts',
'values',
'var',
'view',
'where',
'xs']
# Preview the first values of the cabin column.
df['客舱'].head()
0 NaN
1 C85
2 NaN
3 C123
4 NaN
Name: 客舱, dtype: object
任务五:加载文件"test_1.csv",然后对比"train.csv",看看有哪些多出的列,然后将多出的列删除
# Load test_1.csv from an absolute path and display it so its columns can be
# compared with those of train.csv.
test_1 = pd.read_csv("C:\\Users\\Administrator\\Documents\\DataScience\\hands-on-data-analysis\\第一单元项目集合\\test_1.csv")
test_1
|
Unnamed: 0 |
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
a |
0 |
0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
100 |
1 |
1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
100 |
2 |
2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
100 |
3 |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
100 |
4 |
4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
100 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
886 |
886 |
887 |
0 |
2 |
Montvila, Rev. Juozas |
male |
27.0 |
0 |
0 |
211536 |
13.0000 |
NaN |
S |
100 |
887 |
887 |
888 |
1 |
1 |
Graham, Miss. Margaret Edith |
female |
19.0 |
0 |
0 |
112053 |
30.0000 |
B42 |
S |
100 |
888 |
888 |
889 |
0 |
3 |
Johnston, Miss. Catherine Helen "Carrie" |
female |
NaN |
1 |
2 |
W./C. 6607 |
23.4500 |
NaN |
S |
100 |
889 |
889 |
890 |
1 |
1 |
Behr, Mr. Karl Howell |
male |
26.0 |
0 |
0 |
111369 |
30.0000 |
C148 |
C |
100 |
890 |
890 |
891 |
0 |
3 |
Dooley, Mr. Patrick |
male |
32.0 |
0 |
0 |
370376 |
7.7500 |
NaN |
Q |
100 |
891 rows × 14 columns
# drop() returns a new DataFrame and leaves the original untouched, so the
# result is assigned back; otherwise `test_1` would still contain column 'a'.
test_1 = test_1.drop(['a'], axis=1)
# Echo the frame so the cell still displays the result.
test_1
|
Unnamed: 0 |
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
2 |
2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
3 |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
4 |
4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
886 |
886 |
887 |
0 |
2 |
Montvila, Rev. Juozas |
male |
27.0 |
0 |
0 |
211536 |
13.0000 |
NaN |
S |
887 |
887 |
888 |
1 |
1 |
Graham, Miss. Margaret Edith |
female |
19.0 |
0 |
0 |
112053 |
30.0000 |
B42 |
S |
888 |
888 |
889 |
0 |
3 |
Johnston, Miss. Catherine Helen "Carrie" |
female |
NaN |
1 |
2 |
W./C. 6607 |
23.4500 |
NaN |
S |
889 |
889 |
890 |
1 |
1 |
Behr, Mr. Karl Howell |
male |
26.0 |
0 |
0 |
111369 |
30.0000 |
C148 |
C |
890 |
890 |
891 |
0 |
3 |
Dooley, Mr. Patrick |
male |
32.0 |
0 |
0 |
370376 |
7.7500 |
NaN |
Q |
891 rows × 13 columns
任务六: 将['PassengerId','Name','Age','Ticket']这几个列元素隐藏,只观察其他几个列元素
# Re-read the original data, then view it without four columns. The result of
# drop() is only displayed, not assigned, so `df` itself keeps every column.
df = pd.read_csv('train.csv')
df.drop(columns=['PassengerId', 'Name', 'Age', 'Ticket'])
|
Survived |
Pclass |
Sex |
SibSp |
Parch |
Fare |
Cabin |
Embarked |
0 |
0 |
3 |
male |
1 |
0 |
7.2500 |
NaN |
S |
1 |
1 |
1 |
female |
1 |
0 |
71.2833 |
C85 |
C |
2 |
1 |
3 |
female |
0 |
0 |
7.9250 |
NaN |
S |
3 |
1 |
1 |
female |
1 |
0 |
53.1000 |
C123 |
S |
4 |
0 |
3 |
male |
0 |
0 |
8.0500 |
NaN |
S |
... |
... |
... |
... |
... |
... |
... |
... |
... |
886 |
0 |
2 |
male |
0 |
0 |
13.0000 |
NaN |
S |
887 |
1 |
1 |
female |
0 |
0 |
30.0000 |
B42 |
S |
888 |
0 |
3 |
female |
1 |
2 |
23.4500 |
NaN |
S |
889 |
1 |
1 |
male |
0 |
0 |
30.0000 |
C148 |
C |
890 |
0 |
3 |
male |
0 |
0 |
7.7500 |
NaN |
Q |
891 rows × 8 columns
2.2筛选的逻辑
任务一: 我们以"Age"为筛选条件,显示年龄在10岁以下的乘客信息。
# Boolean-mask filter: keep only the rows whose Age is below 10.
df[df['Age']<10]
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
7 |
8 |
0 |
3 |
Palsson, Master. Gosta Leonard |
male |
2.00 |
3 |
1 |
349909 |
21.0750 |
NaN |
S |
10 |
11 |
1 |
3 |
Sandstrom, Miss. Marguerite Rut |
female |
4.00 |
1 |
1 |
PP 9549 |
16.7000 |
G6 |
S |
16 |
17 |
0 |
3 |
Rice, Master. Eugene |
male |
2.00 |
4 |
1 |
382652 |
29.1250 |
NaN |
Q |
24 |
25 |
0 |
3 |
Palsson, Miss. Torborg Danira |
female |
8.00 |
3 |
1 |
349909 |
21.0750 |
NaN |
S |
43 |
44 |
1 |
2 |
Laroche, Miss. Simonne Marie Anne Andree |
female |
3.00 |
1 |
2 |
SC/Paris 2123 |
41.5792 |
NaN |
C |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
827 |
828 |
1 |
2 |
Mallet, Master. Andre |
male |
1.00 |
0 |
2 |
S.C./PARIS 2079 |
37.0042 |
NaN |
C |
831 |
832 |
1 |
2 |
Richards, Master. George Sibley |
male |
0.83 |
1 |
1 |
29106 |
18.7500 |
NaN |
S |
850 |
851 |
0 |
3 |
Andersson, Master. Sigvard Harald Elias |
male |
4.00 |
4 |
2 |
347082 |
31.2750 |
NaN |
S |
852 |
853 |
0 |
3 |
Boulos, Miss. Nourelain |
female |
9.00 |
1 |
1 |
2678 |
15.2458 |
NaN |
C |
869 |
870 |
1 |
3 |
Johnson, Master. Harold Theodor |
male |
4.00 |
1 |
1 |
347742 |
11.1333 |
NaN |
S |
62 rows × 12 columns
任务二: 以"Age"为条件,将年龄在10岁以上和50岁以下的乘客信息显示出来,并将这个数据命名为midage
# Combine two comparisons with & (each wrapped in parentheses) to keep rows
# with 10 < Age < 50; both bounds are exclusive.
midage = df[(df['Age']>10)&(df['Age']<50)]
midage.head()
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
用 & 连接两个逻辑条件时,每个条件都需要用括号括起来
任务三:将midage的数据中第100行的"Pclass"和"Sex"的数据显示出来
# Print two fields of the row at position 100 of midage, selecting the column
# first and then the position inside it.
print(midage['Pclass'].iloc[100])
print(midage['Sex'].iloc[100])
2
male
也可以用loc写作 midage.loc[[100],['Pclass','Sex']];但要注意loc按索引标签取行、iloc按位置取行,midage是筛选后的结果且没有重置索引,所以两种写法取到的不一定是同一行。
任务四:使用loc方法将midage的数据中第100,105,108行的"Pclass","Name"和"Sex"的数据显示出来
# loc selects by label: the rows labelled 100, 105 and 108 and three named columns.
midage.loc[[100,105,108],['Pclass','Name','Sex']]
|
Pclass |
Name |
Sex |
100 |
3 |
Petranec, Miss. Matilda |
female |
105 |
3 |
Mionoff, Mr. Stoytcho |
male |
108 |
3 |
Rekic, Mr. Tido |
male |
任务五:使用iloc方法将midage的数据中第100,105,108行的"Pclass","Name"和"Sex"的数据显示出来
# iloc selects by position: the rows at positions 100, 105 and 108 and the
# columns at positions 2, 3 and 4 (Pclass, Name, Sex in the output below).
midage.iloc[[100,105,108],[2,3,4]]
|
Pclass |
Name |
Sex |
149 |
2 |
Byles, Rev. Thomas Roussel Davids |
male |
160 |
3 |
Cribb, Mr. John Hatfield |
male |
163 |
3 |
Calic, Mr. Jovo |
male |
3.1开始之前,导入numpy、pandas包和数据
# Load the Chinese-header CSV saved earlier and preview its first rows.
text = pd.read_csv('train.chinese.csv')
text.head()
|
乘客ID |
是否幸存 |
仓位等级 |
姓名 |
性别 |
年龄 |
兄弟姐妹个数 |
父母子女个数 |
船票信息 |
票价 |
客舱 |
登船港口 |
0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
任务一:利用Pandas对示例数据进行排序,要求升序
# Small 2x4 demo frame whose row and column labels are deliberately unsorted.
frame = pd.DataFrame(
    np.arange(8).reshape(2, 4),
    index=['2', '1'],
    columns=['d', 'a', 'b', 'c'],
)
# Sort the rows by index label.
frame.sort_index()
|
d |
a |
b |
c |
1 |
4 |
5 |
6 |
7 |
2 |
0 |
1 |
2 |
3 |
# Sort the columns by label (axis=1).
frame.sort_index(axis=1)
|
a |
b |
c |
d |
2 |
1 |
2 |
3 |
0 |
1 |
5 |
6 |
7 |
4 |
# Sort the columns by label in descending order.
frame.sort_index(axis=1,ascending=False)
|
d |
c |
b |
a |
2 |
0 |
3 |
2 |
1 |
1 |
4 |
7 |
6 |
5 |
# Sort the rows by the values in column 'a', then by column 'c'.
frame.sort_values(by=['a','c'])
|
d |
a |
b |
c |
2 |
0 |
1 |
2 |
3 |
1 |
4 |
5 |
6 |
7 |
任务二:对泰坦尼克号数据(train.csv)按票价和年龄两列进行综合排序(降序排列),从数据中你能发现什么
# Descending sort keyed on Age first, then Fare.
df.sort_values(['Age','Fare'],ascending=False)
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
630 |
631 |
1 |
1 |
Barkworth, Mr. Algernon Henry Wilson |
male |
80.0 |
0 |
0 |
27042 |
30.0000 |
A23 |
S |
851 |
852 |
0 |
3 |
Svensson, Mr. Johan |
male |
74.0 |
0 |
0 |
347060 |
7.7750 |
NaN |
S |
493 |
494 |
0 |
1 |
Artagaveytia, Mr. Ramon |
male |
71.0 |
0 |
0 |
PC 17609 |
49.5042 |
NaN |
C |
96 |
97 |
0 |
1 |
Goldschmidt, Mr. George B |
male |
71.0 |
0 |
0 |
PC 17754 |
34.6542 |
A5 |
C |
116 |
117 |
0 |
3 |
Connors, Mr. Patrick |
male |
70.5 |
0 |
0 |
370369 |
7.7500 |
NaN |
Q |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
481 |
482 |
0 |
2 |
Frost, Mr. Anthony Wood "Archie" |
male |
NaN |
0 |
0 |
239854 |
0.0000 |
NaN |
S |
633 |
634 |
0 |
1 |
Parr, Mr. William Henry Marsh |
male |
NaN |
0 |
0 |
112052 |
0.0000 |
NaN |
S |
674 |
675 |
0 |
2 |
Watson, Mr. Ennis Hastings |
male |
NaN |
0 |
0 |
239856 |
0.0000 |
NaN |
S |
732 |
733 |
0 |
2 |
Knight, Mr. Robert J |
male |
NaN |
0 |
0 |
239855 |
0.0000 |
NaN |
S |
815 |
816 |
0 |
1 |
Fry, Mr. Richard |
male |
NaN |
0 |
0 |
112058 |
0.0000 |
B102 |
S |
891 rows × 12 columns
# Descending sort keyed on Fare first, then Age.
df.sort_values(['Fare','Age'],ascending=False)
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
679 |
680 |
1 |
1 |
Cardeza, Mr. Thomas Drake Martinez |
male |
36.0 |
0 |
1 |
PC 17755 |
512.3292 |
B51 B53 B55 |
C |
258 |
259 |
1 |
1 |
Ward, Miss. Anna |
female |
35.0 |
0 |
0 |
PC 17755 |
512.3292 |
NaN |
C |
737 |
738 |
1 |
1 |
Lesurer, Mr. Gustave J |
male |
35.0 |
0 |
0 |
PC 17755 |
512.3292 |
B101 |
C |
438 |
439 |
0 |
1 |
Fortune, Mr. Mark |
male |
64.0 |
1 |
4 |
19950 |
263.0000 |
C23 C25 C27 |
S |
341 |
342 |
1 |
1 |
Fortune, Miss. Alice Elizabeth |
female |
24.0 |
3 |
2 |
19950 |
263.0000 |
C23 C25 C27 |
S |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
481 |
482 |
0 |
2 |
Frost, Mr. Anthony Wood "Archie" |
male |
NaN |
0 |
0 |
239854 |
0.0000 |
NaN |
S |
633 |
634 |
0 |
1 |
Parr, Mr. William Henry Marsh |
male |
NaN |
0 |
0 |
112052 |
0.0000 |
NaN |
S |
674 |
675 |
0 |
2 |
Watson, Mr. Ennis Hastings |
male |
NaN |
0 |
0 |
239856 |
0.0000 |
NaN |
S |
732 |
733 |
0 |
2 |
Knight, Mr. Robert J |
male |
NaN |
0 |
0 |
239855 |
0.0000 |
NaN |
S |
815 |
816 |
0 |
1 |
Fry, Mr. Richard |
male |
NaN |
0 |
0 |
112058 |
0.0000 |
B102 |
S |
891 rows × 12 columns
任务三:利用Pandas进行算术计算,计算两个DataFrame数据相加结果
# Two float frames whose row and column labels only partly overlap, used to
# demonstrate label alignment in arithmetic.
frame1_a = pd.DataFrame(
    np.arange(9.).reshape(3, 3),
    index=['one', 'two', 'three'],
    columns=['a', 'b', 'c'],
)
frame1_b = pd.DataFrame(
    np.arange(12.).reshape(4, 3),
    index=['first', 'one', 'two', 'second'],
    columns=['a', 'e', 'c'],
)
frame1_a
|
a |
b |
c |
one |
0.0 |
1.0 |
2.0 |
two |
3.0 |
4.0 |
5.0 |
three |
6.0 |
7.0 |
8.0 |
# Display the second frame.
frame1_b
|
a |
e |
c |
first |
0.0 |
1.0 |
2.0 |
one |
3.0 |
4.0 |
5.0 |
two |
6.0 |
7.0 |
8.0 |
second |
9.0 |
10.0 |
11.0 |
# Add the two frames; cells without a matching row/column label in both
# frames come out as NaN (see the output below).
frame1_a + frame1_b
|
a |
b |
c |
e |
first |
NaN |
NaN |
NaN |
NaN |
one |
3.0 |
NaN |
7.0 |
NaN |
second |
NaN |
NaN |
NaN |
NaN |
three |
NaN |
NaN |
NaN |
NaN |
two |
9.0 |
NaN |
13.0 |
NaN |
两个DataFrame相加后,会返回一个新的DataFrame,对应的行和列的值会相加,没有对应的会变成空值NaN。
任务四:通过泰坦尼克号数据如何计算出在船上最大的家族有多少人?
# Add the two relative-count columns row by row and take the largest sum.
# Series.max() replaces the builtin max(): it is vectorised and skips NaN
# instead of iterating the values one by one in Python.
(text['兄弟姐妹个数'] + text['父母子女个数']).max()
10
任务五:学会使用Pandas的describe()函数查看数据基本统计信息
# 4x2 demo frame containing missing values in both columns.
frame2 = pd.DataFrame(
    [
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
frame2
|
one |
two |
a |
1.40 |
NaN |
b |
7.10 |
-4.5 |
c |
NaN |
NaN |
d |
0.75 |
-1.3 |
# NOTE(review): this call summarises `frame` (the 2x4 sorting example, whose
# columns d/a/b/c appear in the output below), not `frame2`, which was built
# just before for this task. Confirm whether `frame2.describe()` was intended.
frame.describe()
|
d |
a |
b |
c |
count |
2.000000 |
2.000000 |
2.000000 |
2.000000 |
mean |
2.000000 |
3.000000 |
4.000000 |
5.000000 |
std |
2.828427 |
2.828427 |
2.828427 |
2.828427 |
min |
0.000000 |
1.000000 |
2.000000 |
3.000000 |
25% |
1.000000 |
2.000000 |
3.000000 |
4.000000 |
50% |
2.000000 |
3.000000 |
4.000000 |
5.000000 |
75% |
3.000000 |
4.000000 |
5.000000 |
6.000000 |
max |
4.000000 |
5.000000 |
6.000000 |
7.000000 |
count : 样本数据中非空值的个数
mean : 样本数据的平均值
std : 样本数据的标准差
min : 样本数据的最小值
25% : 样本数据的25%分位数(第一四分位数)
50% : 样本数据的50%分位数(中位数)
75% : 样本数据的75%分位数(第三四分位数)
max : 样本数据的最大值
任务六:分别看看泰坦尼克号数据集中 票价、父母子女 这列数据的基本统计数据,你能发现什么?
# Summary statistics for the fare column.
text['票价'].describe()
count 891.000000
mean 32.204208
std 49.693429
min 0.000000
25% 7.910400
50% 14.454200
75% 31.000000
max 512.329200
Name: 票价, dtype: float64
# Summary statistics for the parents/children count column.
text['父母子女个数'].describe()
count 891.000000
mean 0.381594
std 0.806057
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 6.000000
Name: 父母子女个数, dtype: float64
``