利用Python进行分析-Chapter 3

Python内置数据类型结构：Tuple、List、dict、set

Tuple（元组）

元组的初始化

tup = 4, 5, 6
nested_tup = (4, 5, 6), (7, 8)

List、string转化为Tuple
```
tuple([4, 0, 2])
tup = tuple('string')
```
支持索引访问
```
tup[0]
```

一旦被定义后就无法修改其中元素

tup = tuple(['foo', [1, 2], True])
tup[2] = False （会报错）

如果其中某个元素是一个列表，那么其中那个列表的元素可以修改
```
tup[1].append(3)  -> ('foo', [1, 2, 3], True)
```

Tuple支持连接

(4, None, 'foo') + (6, 0) + ('bar',)  -> (4, None, 'foo', 6, 0, 'bar')

Tuple支持乘法操作，其实际效果是重复相加

('foo', 'bar') * 4 -> ('foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'bar')

Tuple支持解压操作

tup = (4, 5, 6)
a, b, c = tup   #a, b, c就对应Tuple的三个元素
b, a = a, b    #所以Tuple下值的交换是很方便的，如此a,b即完成了值的交换
seq = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]
for a, b, c in seq:
   print('a={0}, b={1}, c={2}'.format(a, b, c))   #a,b,c就对应列表中某一元素的三个值

Tuple支持高级解压缩

values = 1, 2, 3, 4, 5
a, b, *rest = values  # rest=[3,4,5] 意味着可以支持取出任意长度的后续元素

Tuple不支持修改，因此实例方法很少，常用的是count这类查询操作
```
a = (1, 2, 2, 2, 3, 4, 2)
a.count(2) = 4
```

List（列表）

List定义
```
a_list = [2, 3, 7, None]
```

tuple转化为list

tup = ('foo', 'bar', 'baz')
b_list = list(tup)

list支持修改元素
```
b_list[1] = 'peekaboo'
```

list可以取出迭代器元素

gen = range(10)
list(gen) = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

增删元素

b_list.append('dwarf') #在末尾添加
b_list.insert(1, 'red') #在指定位置添加（插队），被顶的那个元素往后排一下
insert比较耗时；如果你需要在头尾都插入元素，则要考虑collections.deque（即double-ended queue，双端队列）
b_list.pop(2) #删除第二个位置的元素
b_list.remove('foo')  #删除值为“foo”的元素

判断元素是否在list中

'dwarf' in b_list / 'dwarf' not in b_list

List相连接

[4, None, 'foo'] + [7, 8, (2, 3)]
x = [4, None, 'foo'] 
x.extend([7, 8, (2, 3)])  # extend函数也能起到连接的作用，extend函数要花费更少的时间
everything = []
for chunk in list_of_lists:
   everything.extend(chunk)

List内部排序

a = [7, 2, 5, 1, 3]
a.sort()  # in-place sort; a key may be given, e.g. sort(key=len) orders strings by length
import bisect
bisect.bisect(a, 2)  # for an already-sorted list: returns the index where 2 would go, without inserting
bisect.insort(a, 2)  # actually inserts 2 at that index, keeping the list sorted

List切片

seq = [7, 2, 3, 7, 5, 6, 0, 1]
seq[1:5] = [2, 3, 7, 5]   #前闭后开的取值
seq[3:4] = [6, 3] #通过切片集中修改某些位置的值
seq[:5] seq[3:]  #列表首尾皆可忽
seq[-6:-2] #可以用负号反向切片，还是左闭右开
seq[::2] #还可以指定步伐。前进两步从头选到尾
seq[::-1]  #如此便可以反转整个列表，是非常聪明的方式**

内置的sequence方案

enumerate（枚举）

for i,value in enumerate(a):
    print(i,value)

enumerate中mapping的妙用

some_list = ['foo', 'bar', 'baz']
mapping = {}
for i, v in enumerate(some_list):
   mapping[v] = i

sorted（排序，支持排序参数，返回的是一个新的列表）

a= [2,6,5,4,7]
sorted(a) #a不变，输出是排序后的
sorted('horse race') #对字符的ASCII码进行排序

zip（打包功能，看代码）

seq1 = ['foo', 'bar', 'baz']
seq2 = ['one', 'two', 'three']
zipped = zip(seq1, seq2)
list(zipped) #[('foo', 'one'), ('bar', 'two'), ('baz', 'three')]
seq3 = [False, True]
list(zip(seq1, seq2, seq3)) #[('foo', 'one', False), ('bar', 'two', True)]
#结果取决于最短的那个seq

#### 与enumerate结合
for i, (a, b) in enumerate(zip(seq1, seq2)):
    print('{0}: {1}, {2}'.format(i, a, b))
    
#### 巧用zip实现unzip
pitchers = [('Nolan', 'Ryan'), ('Roger', 'Clemens')]
first_names, last_names = zip(*pitchers) #first_names =('Nolan', 'Roger')

反转sequence

list(reversed(range(10)))  #[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

dict（字典）

初始化

empty_dict = {}
d1 = {'a' : 'some value', 'b' : [1, 2, 3, 4]}
d1[7] = 'an integer'

判断是否在dict
```
'b' in d1
```

删除dict元素

del d1[5]  #带一个key
d1.pop('dummy') #返回的是该键对应的值，同时d1中删除了这个键值对

更新dict内的键值，主要是值

list(d1.keys())  list(d1.values())  #key与value的顺序一致的
d1.update({'b' : 'foo', 'c' : 12})  #

可以使用二元tuple来指定一个dict

mapping = dict(zip(range(5), reversed(range(5))))

设置默认值

value = some_dict.get(key, default_value)
# Example input; it produces the grouping shown in the comment further down.
words = ['apple', 'bat', 'bar', 'atom', 'book']

# Variant 1: dict.setdefault creates the empty list the first time a letter is seen.
by_letter = {}
for word in words:
    letter = word[0]
    by_letter.setdefault(letter, []).append(word)
# by_letter == {'a': ['apple', 'atom'], 'b': ['bat', 'bar', 'book']}

# Variant 2: defaultdict(list) supplies the empty list automatically on first access.
from collections import defaultdict
by_letter = defaultdict(list)
for word in words:
    by_letter[word[0]].append(word)

散列性

hash('string')
hash((1, 2, (2, 3)))
hash((1, 2, [2, 3]))  # wrong 
d[tuple([1, 2, 3])] = 5  # {(1, 2, 3): 5}  可hash

set（没有值的字典）

set([2, 2, 2, 1, 3, 3]) #{1, 2, 3}
a = {1, 2, 3, 4, 5}
b = {3, 4, 5, 6, 7, 8}
a.union(b) #或者a | b
a.intersection(b) #或者a & b
c = a.copy()

###set内元素顺序不一致，并不影响set的相等

高阶用法

优雅化

# string
strings = ['a', 'as', 'bat', 'car', 'dove', 'python']
[x.upper() for x in strings if len(x) > 2]

#dict
loc_mapping = {val : index for index, val in enumerate(strings)}

#set 
unique_lengths = {len(x) for x in strings}
set(map(len, strings))

嵌套优雅

#list
# Loop version: collect every name that contains at least two 'e' characters.
names_of_interest = []
for names in all_data:
    enough_es = [name for name in names if name.count('e') >= 2]
    names_of_interest.extend(enough_es)
# Equivalent single nested comprehension: the for-clauses appear in the same
# order as the nested loops above, with the filter last.
result = [name for names in all_data for name in names
          if name.count('e') >= 2]

#set
some_tuples = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]
flattened = [x for tup in some_tuples for x in tup]#[1, 2, 3, 4, 5, 6, 7, 8, 9]
[[x for x in tup] for tup in some_tuples] #[[1, 2, 3], [4, 5, 6], [7, 8, 9]]
#注意区别

函数

基本函数定义

def my_function(x, y, z=1.5):
    """Combine the sum of x and y with z.

    Returns z * (x + y) when z is greater than 1, otherwise z / (x + y).
    z is optional and defaults to 1.5.
    """
    if not z > 1:
        return z / (x + y)
    return z * (x + y)
## 函数的多种调用方式
my_function(5, 6, z=0.7)
my_function(3.14, 7, 3.5)
my_function(10, 20)
my_function(x=5, y=6, z=7)
my_function(y=6, x=5, z=7)

函数的作用域：全局与本地

def bind_a_variable():
    """Bind an empty list to the module-level name `a`.

    Demonstrates that `global` lets a function assign a module-level
    variable; relying on this is discouraged.
    """
    global a
    a = list()


bind_a_variable()

python函数可以返回多个值

def f():
    """Return three values at once; Python packs them into a single tuple.

    A dict such as {'a': a, 'b': b, 'c': c} could be returned instead when
    named results are preferred.
    """
    a, b, c = 5, 6, 7
    return a, b, c


a, b, c = f()       # unpack the three results into separate names
return_value = f()  # or keep the whole 3-tuple

做一个object

states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia', 'FlOrIda',
          'south carolina##', 'West virginia?']

##  常规操作
import re
def clean_strings(strings):
    """Return a new list of cleaned-up strings.

    Each value is stripped of surrounding whitespace, has the characters
    '!', '#' and '?' removed, and is then title-cased.
    """
    return [re.sub('[!#?]', '', value.strip()).title() for value in strings]

## 把function当成一个object
def remove_punctuation(value):
    """Return value with every '!', '#' and '?' character removed."""
    return re.sub('[!#?]', '', value)


# Functions are ordinary objects, so the cleaning pipeline can be stored as a list.
clean_ops = [str.strip, remove_punctuation, str.title]


def clean_strings(strings, ops):
    """Apply every function in ops, in order, to each string.

    Returns a new list of the transformed values.
    Example call: clean_strings(states, clean_ops)
    """
    def run_pipeline(text):
        for op in ops:
            text = op(text)
        return text

    return [run_pipeline(text) for text in strings]

# map applies remove_punctuation lazily to each element of states
for x in map(remove_punctuation, states):
    print(x)

lambda函数

def short_function(x):
    """Return x multiplied by 2.

    The anonymous equivalent is: equiv_anon = lambda x: x * 2
    """
    doubled = x * 2
    return doubled

def apply_to_list(some_list, f):
    """Return a new list holding f(x) for each x in some_list, in order."""
    return list(map(f, some_list))


ints = [4, 0, 1, 5, 6]
apply_to_list(ints, lambda x: x * 2)  # a lambda is convenient for a one-off transform

strings = ['foo', 'card', 'bar', 'aaaa', 'abab']
# Order by the number of distinct characters; list.sort is stable, so ties keep their original order.
strings.sort(key=lambda word: len(set(word)))  # ['aaaa', 'foo', 'abab', 'bar', 'card']

def add_numbers(x, y):
   return x + y
add_five = lambda y: add_numbers(5, y)

from functools import partial
add_five = partial(add_numbers, 5)

生成器

def squares(n=10):
    """Generate the squares 1, 4, ..., n ** 2 one at a time.

    The announcement is printed when the first value is requested, because a
    generator body does not run until iteration starts.
    """
    print('Generating squares from 1 to {0}'.format(n ** 2))
    # Generator expressions are the concise form of simple generators, e.g.
    #   gen = (x ** 2 for x in range(100))
    #   sum(x ** 2 for x in range(100))
    #   dict((i, i ** 2) for i in range(5))
    yield from (i ** 2 for i in range(1, n + 1))


gen = squares()
for x in gen:
    print(x, end=' ')
    
import itertools
first_letter = lambda x: x[0]
names = ['Alan', 'Adam', 'Wes', 'Will', 'Albert', 'Steven']
# groupby only merges consecutive items with equal keys, which is why 'A'
# shows up twice in the output below. The loop variable is called `group`
# so that the `names` list is not overwritten by the last group iterator.
for letter, group in itertools.groupby(names, first_letter):
    print(letter, list(group))  # each group is a lazy iterator; list() materializes it
'''
A ['Alan', 'Adam']
W ['Wes', 'Will']
A ['Albert']
S ['Steven']
'''

[图片上传失败...(image-e82e5-1579059046663)]

报错

def attempt_float(x):
    """Return float(x) when x can be converted, otherwise return x unchanged.

    Only the conversion failures raised by float() are handled: TypeError
    for unsupported types, ValueError for unparsable strings, and
    OverflowError for integers too large for a float. Any other exception
    propagates instead of being silently swallowed by a bare except.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return x
# Whether or not the write succeeds, the finally clause always runs, so the file is closed either way.
f = open(path, 'w')    
try:
   write_to_file(f)
finally:
   f.close()
# Full flow: except handles a failure, else runs only when the try block
# succeeded, and finally always runs to close the file.
f = open(path, 'w')
try:
    write_to_file(f)
except Exception:  # not a bare except, which would also trap KeyboardInterrupt/SystemExit
    print('Failed')
else:
    print('Succeeded')
finally:
    f.close()

文件操作

path = 'examples/segismundo.txt'
f = open(path) #默认为“r” ，read-only打开
#w，创建新文件，如当前路径有再覆盖；x，创建但不覆盖如有就报错
lines = [x.rstrip() for x in open(path)] # rstrip去掉字符串后面的符号（默认为空格）
f.close()
# 一个简单的方式
with open(path) as f:
    lines = [x.rstrip() for x in f]  #这个会自动关闭f
#读取
f = open(path)
f.read(10)
f.tell() #告知当前读的位置
import sys
sys.getdefaultencoding() # 'utf-8'
f.seek(3) # 3
f.read(1) #定位到当前位置3，再读1个字符
# 写入到文件，按行写
with open('tmp.txt', 'w') as handle:
    handle.writelines(x for x in open(path) if len(x) > 1)
with open('tmp.txt') as f:
    lines = f.readlines()
# Decoding bytes into text
data = b'Sue\xc3\xb1a el '
data.decode('utf8')  # data[:4].decode('utf8') raises UnicodeDecodeError: the slice cuts a 2-byte UTF-8 sequence in half

image

[图片上传失败...(image-d399cd-1579059046663)]

利用Python进行分析-Chapter 3

利用Python进行分析-Chapter 3

你可能感兴趣的:(利用Python进行分析-Chapter 3)