Notes on Python for Data Analysis: Chapter 6

Reading and Writing Data in Text Format


import pandas as pd
import numpy as np
from pandas import Series,DataFrame
!type "E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv"
a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
df = pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv")
#equivalent to
df = pd.read_table(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv", sep=',')
#for a file without a header row
pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex2.csv", header=None)
0 1 2 3 4
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
#specify the column names yourself
pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex2.csv", names=['a','b','c','d','message'])
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
names = ['a','b','c','d','message']
pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex2.csv", names=names, index_col='message')
a b c d
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
#form a hierarchical index by passing a list of column numbers or names
parsed = pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\csv_mindex.csv", index_col=['key1','key2'])
parsed
value1 value2
key1 key2
one a 1 2
b 3 4
c 5 6
d 7 8
two a 9 10
b 11 12
c 13 14
d 15 16
list(open(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex3.txt"))
['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']
#the fields here are separated by a variable amount of whitespace,
#which can be expressed with the regular expression \s+
result = pd.read_table(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex3.txt", sep=r'\s+')
result
A B C
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382 1.100491
#skip selected rows while reading
pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex4.csv", skiprows=[0,2,3])
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
#data containing missing values
result = pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex5.csv")
result
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
pd.isnull(result)
something a b c d message
0 False False False False False True
1 False False False True False False
2 False False False False False False

pandas recognizes a set of commonly occurring sentinel values as missing, such as NA, -1.#IND, NULL, and the empty string.

#na_values takes a list of strings to treat as missing values
result = pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex5.csv", na_values=['NULL'])
pd.isnull(result)
something a b c d message
0 False False False False False True
1 False False False True False True
2 False False False False False False
#a dict can specify a different set of NA sentinels for each column
sentinels = {'message':['foo','NA'], 'something':['two']}
pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex5.csv", na_values=sentinels)
something a b c d message
0 one 1 2 3.0 4 NaN
1 NaN 5 6 NaN 8 world
2 three 9 10 11.0 12 NaN
#side note: the two screenshots below list the read_csv/read_table function arguments
import matplotlib.pyplot as plt
img = plt.imread('read_csv或read_table函数的参数.png')
plt.imshow(img)

img = plt.imread('read_csv或read_table函数的参数2.png')
plt.imshow(img)

[The two images of the read_csv/read_table parameter tables did not survive transfer.]

Reading Text Files in Pieces

result = pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex6.csv")
#read only the first few rows
pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex6.csv", nrows=5)
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
#read the file in pieces
chunker = pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex6.csv", chunksize=1000)
chunker

#read_csv returns a TextParser (TextFileReader) object that iterates over the file in chunks
#aggregate the value counts of the 'key' column across the chunks
tot = Series([], dtype='float64')
for piece in chunker:
    #Series.add aligns on the index and adds element-wise; fill_value=0 treats
    #labels missing from one side as zero. value_counts() counts each key.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)
tot[:10]
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

The get_chunk method lets you read pieces of an arbitrary size.
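For example, a minimal sketch (reusing the same ex6.csv path as above; a fresh reader is needed because the loop above exhausted the old one):

chunker = pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex6.csv", chunksize=1000)
piece = chunker.get_chunk(10)  #pull exactly 10 rows from the parser
len(piece)                     #10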

Writing Data Out to Text Format

data = pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex5.csv")
#write the data out to a comma-separated file
data.to_csv(r"E:\python_study_files\ipython_data_analysis\out.csv")
import sys
data.to_csv(sys.stdout,sep='|')
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
data.to_csv(sys.stdout,na_rep='NULL')
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
data.to_csv(sys.stdout,index=False,header=False)
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
data.to_csv(sys.stdout,index=False,columns=['a','b','c'])
a,b,c
1,2,3.0
5,6,
9,10,11.0
#Series also has a to_csv method
#generate dates from 2000-01-01 through 2000-01-07
dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7),index=dates)
ts.to_csv("E:\\python_study_files\\ipython_data_analysis\\tseries.csv")
!type "E:\python_study_files\ipython_data_analysis\tseries.csv"
,0
2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6
dates
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')
#reading it back: Series.from_csv(path, parse_dates=True) used to be the
#equivalent call, but from_csv has been removed, so use read_csv instead
pd.read_csv("E:\\python_study_files\\ipython_data_analysis\\tseries.csv", parse_dates=True)
Unnamed: 0 0
0 2000-01-01 0
1 2000-01-02 1
2 2000-01-03 2
3 2000-01-04 3
4 2000-01-05 4
5 2000-01-06 5
6 2000-01-07 6
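To get back a Series rather than a DataFrame with an 'Unnamed: 0' column, pass index_col=0 and squeeze away the single remaining column; a minimal sketch:

ts2 = pd.read_csv("E:\\python_study_files\\ipython_data_analysis\\tseries.csv",
                  index_col=0, parse_dates=True).squeeze("columns")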

Manually Working with Delimited Formats

!type "E:\python_study_files\python\pydata-book-2nd-edition\examples\ex7.csv"
"a","b","c"
"1","2","3"
"1","2","3"
import csv
f = open(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex7.csv")
reader = csv.reader(f)  #same as csv.reader(f, delimiter=',')
reader
<_csv.reader at 0x1c28677b160>
for line in reader:
    print(line)
['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']
lines = list(csv.reader(open(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex7.csv")))
header,values = lines[0],lines[1:]
data_dict = {h: v for h, v in zip(header,zip(*values))}
data_dict
{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}
zip(*values)  #transposes the rows: yields one tuple per column

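The code below uses my_dialect, which was presumably defined in the image that did not survive transfer. To write delimited files manually, subclass csv.Dialect; a definition consistent with the book's example (the delimiter and quoting choices are the book's):

#define a new dialect: semicolon-delimited, minimal quoting
class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL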

with open('mydata.csv','w') as f:
    writer = csv.writer(f,dialect=my_dialect)
    writer.writerow(('one','two','three'))
    writer.writerow(('1','2','3'))
(With the dialect class defined first, the writer runs and produces a semicolon-delimited mydata.csv; without it, this cell raises NameError: name 'my_dialect' is not defined.)

JSON Data

obj =  """
{"name":"Wes",
"places_lived":["United States","Spain","Germany"],
"pet":null,
"siblings":[{"name":"Scott","age":25,"pet":"Zuko"},
           {"name":"Katie","age":33,"pet":"Cisco"}]
}
"""
import json
#convert a JSON string to a Python object
result = json.loads(obj)
result
{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'},
  {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}]}
#convert a Python object back to JSON
asjson = json.dumps(result)
siblings = DataFrame(result['siblings'],columns=['name','age'])
siblings
name age
0 Scott 25
1 Katie 33
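pandas can also convert between DataFrames and JSON directly via to_json and read_json; a minimal round-trip sketch:

from io import StringIO
sib_json = siblings.to_json()      #DataFrame -> JSON string
pd.read_json(StringIO(sib_json))   #JSON -> DataFrame (wrap a string in StringIO)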

XML and HTML: Web Scraping

from lxml.html import parse
import urllib.request
parsed = parse(urllib.request.urlopen('http://www.stats.gov.cn/tjsj/ndsj/2021/indexch.htm'))
doc = parsed.getroot()
#find all HTML tags of a given type
links = doc.findall('.//a')
links
[]
#this page returned no <a> tags (the content is likely rendered by JavaScript
#or blocked), so the lines below follow the book's pattern but would raise
#IndexError here
#to get the URL and link text, use each element's get method (for the URL)
#and text_content method (for the display text)
lnk = links[28]
lnk.get('href')
lnk.text_content()
#a list comprehension collects every URL in the document
urls = [lnk.get('href') for lnk in doc.findall('.//a')]
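For HTML whose data sits in <table> tags, pandas.read_html is usually much simpler than walking the tree by hand; a sketch (the FDIC failed-bank page is the book's example, and the call needs lxml or html5lib installed):

#read_html parses every <table> on the page into a list of DataFrames
tables = pd.read_html('https://www.fdic.gov/bank/individual/failed/banklist.html')
failures = tables[0]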
#the TextParser class can be used for automatic type conversion
from pandas.io.parsers import TextParser
from lxml import objectify
path = 'Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()
data = []
skip_fields = ['PARENT_SEQ','INDICATOR_SEQ','DESIRED_CHANGE','DECIMAL_PLACES']
#root.INDICATOR returns a generator yielding each <INDICATOR> XML element
for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)
perf = DataFrame(data)
perf
AGENCY_NAME INDICATOR_NAME DESCRIPTION PERIOD_YEAR PERIOD_MONTH CATEGORY FREQUENCY INDICATOR_UNIT YTD_TARGET YTD_ACTUAL MONTHLY_TARGET MONTHLY_ACTUAL
0 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at thei... 2008 1 Service Indicators M % 95.0 96.9 95.0 96.9
1 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at thei... 2008 2 Service Indicators M % 95.0 96.0 95.0 95.0
2 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at thei... 2008 3 Service Indicators M % 95.0 96.3 95.0 96.9
3 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at thei... 2008 4 Service Indicators M % 95.0 96.8 95.0 98.3
4 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at thei... 2008 5 Service Indicators M % 95.0 96.6 95.0 95.8
... ... ... ... ... ... ... ... ... ... ... ... ...
643 Metro-North Railroad Escalator Availability Percent of the time that escalators are operat... 2011 8 Service Indicators M % 97.0 97.0
644 Metro-North Railroad Escalator Availability Percent of the time that escalators are operat... 2011 9 Service Indicators M % 97.0 97.0
645 Metro-North Railroad Escalator Availability Percent of the time that escalators are operat... 2011 10 Service Indicators M % 97.0 97.0
646 Metro-North Railroad Escalator Availability Percent of the time that escalators are operat... 2011 11 Service Indicators M % 97.0 97.0
647 Metro-North Railroad Escalator Availability Percent of the time that escalators are operat... 2011 12 Service Indicators M % 97.0 97.0

648 rows × 12 columns
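As an aside, newer pandas versions (1.3+) can flatten simple record-like XML in one call with read_xml; a sketch against the same file:

#each <INDICATOR> element becomes one row; requires pandas >= 1.3 and lxml
perf2 = pd.read_xml(path, xpath='.//INDICATOR')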

Binary Data Formats

pickle is best used only for short-term storage, since there is no guarantee the format will stay stable over time.

frame = pd.read_csv(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv")
frame
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
#frame.save('frame_pickle')
#save has been replaced by the to_pickle method; likewise, load is now read_pickle
frame.to_pickle('frame_pickle')  #serialize to binary pickle format
#read the binary data back
pd.read_pickle('frame_pickle')
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo

Using the HDF5 Format

If you work with huge quantities of data, it is worth looking at PyTables and h5py, since many data analysis problems are I/O-bound rather than CPU-bound. Note that HDF5 works best as a "write once, read many" data store.

store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store
ImportError: Missing optional dependency 'tables'.  Use pip or conda to install tables.
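Once PyTables is installed (pip install tables), the cell above works. HDFStore also supports a slower but queryable 'table' format; a sketch following the book's pattern:

store = pd.HDFStore('mydata.h5')
store['obj1'] = frame                      #default 'fixed' format: fast, not queryable
store.put('obj2', frame, format='table')   #'table' format supports where-style queries
store.select('obj2', where=['index >= 0 and index <= 1'])
store.close()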

Reading Microsoft Excel Files

import openpyxl  #the engine pandas uses for .xlsx files
xls_file = pd.ExcelFile('data.xlsx')
table = xls_file.parse('Sheet1')  #read the named sheet into a DataFrame
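Writing works the other way through to_excel, either straight to a path or through an ExcelWriter; a minimal sketch (the output filename is illustrative):

#write a DataFrame out to a new .xlsx workbook
table.to_excel('data_out.xlsx', sheet_name='Sheet1')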

Interacting with HTML and Web APIs

import requests
url = 'http://www.json.org.cn/resource/json-in-javascript.htm'
resp = requests.get(url)
resp

import json
data = json.loads(resp.text)
data.keys()
JSONDecodeError: Expecting value: line 1 column 1 (char 0)

(That URL serves an HTML page rather than JSON, so json.loads fails on the first character of the response body.)
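An endpoint that actually returns JSON behaves as intended; a sketch using the public GitHub API, the example the book's 2nd edition uses:

resp = requests.get('https://api.github.com/repos/pandas-dev/pandas/issues')
data = resp.json()   #parse the response body as JSON (a list of dicts here)
issues = DataFrame(data, columns=['number', 'title', 'labels', 'state'])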

Working with Databases

import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20),b VARCHAR(20),
c REAL,        d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()
#insert a few rows of data
data = [('Atlanta','Georgia',1.25,6),
       ('Tallahassee','Florida',2.6,3),
       ('Sacramento','California',1.7,5)]
stmt = "INSERT INTO test VALUES(?,?,?,?)"

con.executemany(stmt,data)
con.commit()
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows
[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]
cursor.description
(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))
#in Python 3, zip returns an iterator, so zip(*cursor.description)[0] raises
#TypeError: 'zip' object is not subscriptable; materialize the names instead
DataFrame(rows, columns=[col[0] for col in cursor.description])
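pandas wraps this whole pattern in read_sql, which accepts a query plus a DB-API connection or SQLAlchemy engine; reusing the connection above:

#read_sql runs the query and returns a DataFrame with column names filled in
pd.read_sql('select * from test', con)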

Storing and Loading Data in MongoDB

#import pymongo
#con = pymongo.Connection('localhost', port=27017) was the old API;
#the Connection class has been removed, so use MongoClient instead
from pymongo import MongoClient
con = MongoClient('localhost', port=27017)
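A minimal round trip with the current API might look like this (the database and collection names are made up for illustration):

db = con.test_db      #databases and collections are created lazily
coll = db.cities
coll.insert_one({'city': 'Atlanta', 'state': 'Georgia'})  #store a document
list(coll.find({'state': 'Georgia'}))                     #query documents back as dicts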
