利用Python进行数据分析-Chapter 3
Python内置数据类型结构:Tuple、List、dict、set
-
Tuple(元组)
-
元组的初始化
tup = 4, 5, 6 nested_tup = (4, 5, 6), (7, 8)
-
List、string转化为Tuple
tuple([4, 0, 2]) tup = tuple('string')
-
支持索引访问
tup[0]
-
一旦被定义后就无法修改其中元素
tup = tuple(['foo', [1, 2], True]) tup[2] = False (会报错)
-
如果其中某个元素是一个列表,那么其中那个列表的元素可以修改
tup[1].append(3) -> ('foo', [1, 2, 3], True)
-
Tuple支持连接
(4, None, 'foo') + (6, 0) + ('bar',) -> (4, None, 'foo', 6, 0, 'bar')
-
Tuple支持乘法操作,其实际效果是重复相加
('foo', 'bar') * 4 -> ('foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'bar')
-
Tuple支持解压(unpacking)操作
tup = (4, 5, 6) a, b, c = tup #a, b, c就对应Tuple的三个元素 b, a = a, b #所以Tuple下值的交换是很方便的,如此a,b即完成了值的交换 seq = [(1, 2, 3), (4, 5, 6), (7, 8, 9)] for a, b, c in seq: print('a={0}, b={1}, c={2}'.format(a, b, c)) #a,b,c就对应列表中某一元素的三个值
-
Tuple支持高级解压缩
values = 1, 2, 3, 4, 5 a, b, *rest = values # rest=[3,4,5] 意味着可以支持取出任意长度的后续元素
-
Tuple不支持修改,因而支持众多的查询操作
a = (1, 2, 2, 2, 3, 4, 2) a.count(2) = 4
-
-
List(列表)
-
List定义
a_list = [2, 3, 7, None]
-
tuple转化为list
tup = ('foo', 'bar', 'baz') b_list = list(tup)
-
list支持修改元素
b_list[1] = 'peekaboo'
-
list可以取出迭代器元素
gen = range(10) list(gen) = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
-
增删元素
b_list.append('dwarf') #在末尾添加 b_list.insert(1, 'red') #在指定位置添加(插队),被顶的那个元素往后排一下 insert比较耗时,如果你需要在头尾都插入元素,则要考虑collections.deque(即双端队列,double-ended queue) b_list.pop(2) #删除并返回索引为2的元素(即第三个元素) b_list.remove('foo') #删除第一个值为“foo”的元素
-
判断元素是否在list中
'dwarf' in b_list / 'dwarf' not in b_list
-
List相连接
[4, None, 'foo'] + [7, 8, (2, 3)] x = [4, None, 'foo'] x.extend([7, 8, (2, 3)]) # extend函数也能起到连接的作用,extend函数要花费更少的时间 everything = [] for chunk in list_of_lists: everything.extend(chunk)
-
List内部排序
a = [7, 2, 5, 1, 3] a.sort() #a被排序,sort中有一个参数,如sort(key=len),即按照字符串的长度排序 import bisect bisect.bisect(a,2) #对**已排序的列表**查找新元素应插入的位置,返回的是插入的位置,并不真正插入 bisect.insort(a,2) #真正插入一个值
-
List切片
seq = [7, 2, 3, 7, 5, 6, 0, 1] seq[1:5] = [2, 3, 7, 5] #前闭后开的取值 seq[3:4] = [6, 3] #通过切片集中修改某些位置的值 seq[:5] seq[3:] #列表首尾皆可省略 seq[-6:-2] #可以用负号反向切片,还是左闭右开 seq[::2] #还可以指定步长。每隔两步从头选到尾 seq[::-1] #如此便可以反转整个列表,是非常聪明的方式
-
-
内置的sequence方案
-
enumerate(枚举)
for i,value in enumerate(a): print(i,value)
enumerate中mapping的妙用
some_list = ['foo', 'bar', 'baz'] mapping = {} for i, v in enumerate(some_list): mapping[v] = i
-
sorted(排序,支持排序参数,返回的是一个新的列表)
a= [2,6,5,4,7] sorted(a) #a不变,输出是排序后的 sorted('horse race') #对字符的ASCII码进行排序
-
zip(打包功能,看代码)
seq1 = ['foo', 'bar', 'baz'] seq2 = ['one', 'two', 'three'] zipped = zip(seq1, seq2) list(zipped) #[('foo', 'one'), ('bar', 'two'), ('baz', 'three')] seq3 = [False, True] list(zip(seq1, seq2, seq3)) #[('foo', 'one', False), ('bar', 'two', True)] #结果取决于最短的那个seq #### 与enumerate结合 for i, (a, b) in enumerate(zip(seq1, seq2)): print('{0}: {1}, {2}'.format(i, a, b)) #### 巧用zip实现unzip pitchers = [('Nolan', 'Ryan'), ('Roger', 'Clemens')] first_names, last_names = zip(*pitchers) #first_names =('Nolan', 'Roger')
-
反转sequence
list(reversed(range(10))) #[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
-
-
dict(字典)
-
初始化
empty_dict = {} d1 = {'a' : 'some value', 'b' : [1, 2, 3, 4]} d1[7] = 'an integer'
-
判断是否在dict
'b' in d1
-
删除dict元素
del d1[5] #带一个key d1.pop('dummy') #返回的是被删除的键所对应的值,d1中删除了这个键值对
-
更新dict内的键值,主要是值
list(d1.keys()) list(d1.values()) #key与value的顺序一致的 d1.update({'b' : 'foo', 'c' : 12}) #
-
可以使用二元tuple来指定一个dict
mapping = dict(zip(range(5), reversed(range(5))))
-
设置默认值
value = some_dict.get(key, default_value) for word in words: letter = word[0] by_letter.setdefault(letter, []).append(word) #{'a': ['apple', 'atom'], 'b': ['bat', 'bar', 'book']} from collections import defaultdict by_letter = defaultdict(list) for word in words: by_letter[word[0]].append(word)
-
散列性
hash('string') hash((1, 2, (2, 3))) hash((1, 2, [2, 3])) # wrong d[tuple([1, 2, 3])] = 5 # {(1, 2, 3): 5} 可hash
-
-
set(没有值的字典)
set([2, 2, 2, 1, 3, 3]) #{1, 2, 3} a = {1, 2, 3, 4, 5} b = {3, 4, 5, 6, 7, 8} a.union(b) #或者a | b a.intersection(b) #或者a & b c = a.copy() ###set内元素顺序不一致,并不影响set的相等
-
高阶用法
-
优雅化
# string strings = ['a', 'as', 'bat', 'car', 'dove', 'python'] [x.upper() for x in strings if len(x) > 2] #dict loc_mapping = {val : index for index, val in enumerate(strings)} #set unique_lengths = {len(x) for x in strings} set(map(len, strings))
-
嵌套优雅
#list names_of_interest = [] for names in all_data: enough_es = [name for name in names if name.count('e') >= 2] names_of_interest.extend(enough_es) result = [name for names in all_data for name in names if name.count('e') >= 2] #set some_tuples = [(1, 2, 3), (4, 5, 6), (7, 8, 9)] flattened = [x for tup in some_tuples for x in tup]#[1, 2, 3, 4, 5, 6, 7, 8, 9] [[x for x in tup] for tup in some_tuples] #[[1, 2, 3], [4, 5, 6], [7, 8, 9]] #注意区别
-
-
函数
-
基本函数定义
def my_function(x, y, z=1.5): # 采用默认值的 if z > 1: return z * (x + y) else: return z / (x + y) ## 函数的多种调用方式 my_function(5, 6, z=0.7) my_function(3.14, 7, 3.5) my_function(10, 20) my_function(x=5, y=6, z=7) my_function(y=6, x=5, z=7)
-
函数的作用域:全局与本地
def bind_a_variable(): #本地内变量定义为全局,但是不鼓励使用这类变量 global a a = [] bind_a_variable()
-
python函数可以返回多个值
def f(): a = 5 b = 6 c = 7 return a, b, c # return {'a' : a, 'b' : b, 'c' : c} 返回一个字典 a, b, c = f() return_value = f() #return_value就是一个三元元组
-
做一个object
states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia', 'FlOrIda', 'south carolina##', 'West virginia?'] ## 常规操作 import re def clean_strings(strings): result = [] for value in strings: value = value.strip() value = re.sub('[!#?]', '', value) value = value.title() result.append(value) return result ## 把function当成一个object def remove_punctuation(value): return re.sub('[!#?]', '', value) clean_ops = [str.strip, remove_punctuation, str.title] def clean_strings(strings, ops): result = [] for value in strings: for function in ops: value = function(value) result.append(value) return result #clean_strings(states, clean_ops) # map function for x in map(remove_punctuation, states): print(x)
-
lambda函数
def short_function(x): return x * 2 # equiv_anon = lambda x: x * 2 def apply_to_list(some_list, f): return [f(x) for x in some_list] ints = [4, 0, 1, 5, 6] apply_to_list(ints, lambda x: x * 2) strings = ['foo', 'card', 'bar', 'aaaa', 'abab'] strings.sort(key=lambda x: len(set(list(x))))#['aaaa', 'foo', 'abab', 'bar', 'card'] def add_numbers(x, y): return x + y add_five = lambda y: add_numbers(5, y) from functools import partial add_five = partial(add_numbers, 5)
-
生成器
def squares(n=10): print('Generating squares from 1 to {0}'.format(n ** 2)) for i in range(1, n + 1): yield i ** 2 # 优雅:gen = (x ** 2 for x in range(100)) 生成器 # sum(x ** 2 for x in range(100)) # dict((i, i **2) for i in range(5)) gen = squares() for x in gen: print(x, end=' ') import itertools first_letter = lambda x: x[0] names = ['Alan', 'Adam', 'Wes', 'Will', 'Albert', 'Steven'] for letter, names in itertools.groupby(names, first_letter): print(letter, list(names)) # names is a generator ''' A ['Alan', 'Adam'] W ['Wes', 'Will'] A ['Albert'] S ['Steven'] '''
[图片上传失败...(image-e82e5-1579059046663)]
-
报错
def attempt_float(x): try: return float(x) except: # except (TypeError, ValueError): return x # 不论正确与否,都要继续执行 f = open(path, 'w') try: write_to_file(f) finally: f.close() # 整套流程 f = open(path, 'w') try: write_to_file(f) except: print('Failed') else: print('Succeeded') finally: f.close()
-
-
文件操作
path = 'examples/segismundo.txt' f = open(path) #默认为“r” ,read-only打开 #w,创建新文件,如当前路径有再覆盖;x,创建但不覆盖如有就报错 lines = [x.rstrip() for x in open(path)] # rstrip去掉字符串末尾的字符(默认为空白字符,包括空格和换行符) f.close() # 一个简单的方式 with open(path) as f: lines = [x.rstrip() for x in f] #这个会自动关闭f #读取 f = open(path) f.read(10) f.tell() #告知当前读的位置 import sys sys.getdefaultencoding() # 'utf-8' f.seek(3) # 3 f.read(1) #定位到位置3,再从该位置读取1个字符 # 写入到文件,按行写 with open('tmp.txt', 'w') as handle: handle.writelines(x for x in open(path) if len(x) > 1) with open('tmp.txt') as f: lines = f.readlines() # 对文件解码 data b'Sue\xc3\xb1a el ' data.decode('utf8') # data[:4].decode('utf8') 不完整则不行,会报错
[图片上传失败...(image-d399cd-1579059046663)]