在这里我直接导入的jupyter notebook文件
数据集下载 https://www.kaggle.com/c/titanic/overview
#写入代码
import numpy as np
import pandas as pd
【提示】如果加载失败,学会如何在你的python环境下安装numpy和pandas这两个库
(1) 使用相对路径载入数据
(2) 使用绝对路径载入数据
#写入代码
#1.通过标准的Python库导入CSV文件
from csv import reader
filename=input("请输入文件名: ")
with open(filename,'rt',encoding='UTF-8') as raw_data:
readers=reader(raw_data,delimiter=',')
#delimiter参数为分隔符
x=list(readers)
data=np.array(x)
print(data)
print(data.shape)
请输入文件名: train.csv
[['PassengerId' 'Survived' 'Pclass' ... 'Fare' 'Cabin' 'Embarked']
['1' '0' '3' ... '7.25' '' 'S']
['2' '1' '1' ... '71.2833' 'C85' 'C']
...
['889' '0' '3' ... '23.45' '' 'S']
['890' '1' '1' ... '30' 'C148' 'C']
['891' '0' '3' ... '7.75' '' 'Q']]
(892, 12)
#2、通过NumPy导入CSV文件——会出现报错,应该是有空数据等原因
from numpy import loadtxt
filename=input("文件名:")
with open(filename,encoding='UTF-8')as raw_data:
data=loadtxt(raw_data,delimiter=',')
print(data)
文件名:train.csv
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
in
3 filename=input("文件名:")
4 with open(filename,encoding='UTF-8')as raw_data:
----> 5 data=loadtxt(raw_data,delimiter=',')
6 print(data)
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin, encoding, max_rows)
1139 # converting the data
1140 X = None
-> 1141 for x in read_data(_loadtxt_chunksize):
1142 if X is None:
1143 X = np.array(x, dtype)
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in read_data(chunk_size)
1066
1067 # Convert each value according to its column and store
-> 1068 items = [conv(val) for (conv, val) in zip(converters, vals)]
1069
1070 # Then pack it according to the dtype's nesting
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in (.0)
1066
1067 # Convert each value according to its column and store
-> 1068 items = [conv(val) for (conv, val) in zip(converters, vals)]
1069
1070 # Then pack it according to the dtype's nesting
~\Anaconda3\lib\site-packages\numpy\lib\npyio.py in floatconv(x)
773 if '0x' in x:
774 return float.fromhex(x)
--> 775 return float(x)
776
777 typ = dtype.type
ValueError: could not convert string to float: 'PassengerId'
from pandas import read_csv
filename=input("文件名:")
f=open(filename,encoding='UTF-8')
data=read_csv(f)
print(data)
文件名:train.csv
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
.. ... ... ...
886 887 0 2
887 888 1 1
888 889 0 3
889 890 1 1
890 891 0 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0
.. ... ... ... ...
886 Montvila, Rev. Juozas male 27.0 0
887 Graham, Miss. Margaret Edith female 19.0 0
888 Johnston, Miss. Catherine Helen "Carrie" female NaN 1
889 Behr, Mr. Karl Howell male 26.0 0
890 Dooley, Mr. Patrick male 32.0 0
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S
.. ... ... ... ... ...
886 0 211536 13.0000 NaN S
887 0 112053 30.0000 B42 S
888 2 W./C. 6607 23.4500 NaN S
889 0 111369 30.0000 C148 C
890 0 370376 7.7500 NaN Q
[891 rows x 12 columns]
#写入代码
#相对地址读入
df = pd.read_csv('train.csv')
df.head(3)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
#绝对地址读
df = pd.read_csv('E:/juperter notebook/动手学数据分析\第一单元项目集合/train.csv')
df.head(3)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
【提示】相对路径载入报错时,尝试使用os.getcwd()查看当前工作目录。
【思考】知道数据加载的方法后,试试pd.read_csv()和pd.read_table()的不同,如果想让他们效果一样,需要怎么做?了解一下’.tsv’和’.csv’的不同,如何加载这两个数据集?
【总结】加载的数据是所有工作的第一步,我们的工作会接触到不同的数据格式(eg:.csv;.tsv;.xlsx),但是加载的方法和思路都是一样的,在以后工作和做项目的过程中,遇到之前没有碰到的问题,要多多查资料吗,使用googel,了解业务逻辑,明白输入和输出是什么。
#写入代码
chunker = pd.read_csv('train.csv', chunksize=1000)
【思考】什么是逐块读取?为什么要逐块读取呢?
有时候文件太大了,需要分开读取数据,每次读取一小部分。
for piece in chunker:
print(type(piece))
print(len(piece))
891
PassengerId => 乘客ID
Survived => 是否幸存
Pclass => 乘客等级(1/2/3等舱位)
Name => 乘客姓名
Sex => 性别
Age => 年龄
SibSp => 堂兄弟/妹个数
Parch => 父母与小孩个数
Ticket => 船票信息
Fare => 票价
Cabin => 客舱
Embarked => 登船港口
#写入代码
names=['乘客ID','是否幸存','仓位等级','姓名','性别','年龄','兄弟姐妹个数','父母子女个数','船票信息','票价','客舱','登船港口']
index_col='乘客ID'
df = pd.read_csv('train.csv', names=names,index_col=index_col,header=0)
df.head()
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 | |
---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | |||||||||||
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
【思考】所谓将表头改为中文其中一个思路是:将英文额度表头替换成中文。还有其他的方法吗?
导入数据后,你可能要对数据的整体结构和样例进行概览,比如说,数据大小、有多少列,各列都是什么格式的,是否包含null等
#写入代码
df.describe()
#https://www.cnblogs.com/ffli/p/12201448.html
是否幸存 | 仓位等级 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 票价 | |
---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
df.info()
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
是否幸存 891 non-null int64
仓位等级 891 non-null int64
姓名 891 non-null object
性别 891 non-null object
年龄 714 non-null float64
兄弟姐妹个数 891 non-null int64
父母子女个数 891 non-null int64
船票信息 891 non-null object
票价 891 non-null float64
客舱 204 non-null object
登船港口 889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
【提示】有多个函数可以这样做,你可以做一下总结
#写入代码
df.head(10)
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 | |
---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | |||||||||||
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | NaN | S |
9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NaN | S |
10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
#写入代码
df.tail(15)
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 | |
---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | |||||||||||
877 | 0 | 3 | Gustafsson, Mr. Alfred Ossian | male | 20.0 | 0 | 0 | 7534 | 9.8458 | NaN | S |
878 | 0 | 3 | Petroff, Mr. Nedelio | male | 19.0 | 0 | 0 | 349212 | 7.8958 | NaN | S |
879 | 0 | 3 | Laleff, Mr. Kristo | male | NaN | 0 | 0 | 349217 | 7.8958 | NaN | S |
880 | 1 | 1 | Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | female | 56.0 | 0 | 1 | 11767 | 83.1583 | C50 | C |
881 | 1 | 2 | Shelley, Mrs. William (Imanita Parrish Hall) | female | 25.0 | 0 | 1 | 230433 | 26.0000 | NaN | S |
882 | 0 | 3 | Markun, Mr. Johann | male | 33.0 | 0 | 0 | 349257 | 7.8958 | NaN | S |
883 | 0 | 3 | Dahlberg, Miss. Gerda Ulrika | female | 22.0 | 0 | 0 | 7552 | 10.5167 | NaN | S |
884 | 0 | 2 | Banfield, Mr. Frederick James | male | 28.0 | 0 | 0 | C.A./SOTON 34068 | 10.5000 | NaN | S |
885 | 0 | 3 | Sutehall, Mr. Henry Jr | male | 25.0 | 0 | 0 | SOTON/OQ 392076 | 7.0500 | NaN | S |
886 | 0 | 3 | Rice, Mrs. William (Margaret Norton) | female | 39.0 | 0 | 5 | 382652 | 29.1250 | NaN | Q |
887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
#写入代码
df.isnull()
是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 | |
---|---|---|---|---|---|---|---|---|---|---|---|
乘客ID | |||||||||||
1 | False | False | False | False | False | False | False | False | False | True | False |
2 | False | False | False | False | False | False | False | False | False | False | False |
3 | False | False | False | False | False | False | False | False | False | True | False |
4 | False | False | False | False | False | False | False | False | False | False | False |
5 | False | False | False | False | False | False | False | False | False | True | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
887 | False | False | False | False | False | False | False | False | False | True | False |
888 | False | False | False | False | False | False | False | False | False | False | False |
889 | False | False | False | False | True | False | False | False | False | True | False |
890 | False | False | False | False | False | False | False | False | False | False | False |
891 | False | False | False | False | False | False | False | False | False | True | False |
891 rows × 11 columns
【总结】上面的操作都是数据分析中对于数据本身的观察
【思考】对于一个数据,还可以从哪些方面来观察?找找答案,这个将对下面的数据分析有很大的帮助
#写入代码
df.to_csv('train_chinese.csv')
【总结】数据的加载以及入门,接下来就要接触数据本身的运算,我们将主要掌握numpy和pandas在工作和项目场景的运用。