利用Python进行分析-Chapter 3

利用Python进行分析-Chapter 3

Python内置数据类型结构:Tuple、List、dict、set

  1. Tuple(元组)

    • 元组的初始化

      tup = 4, 5, 6
      nested_tup = (4, 5, 6), (7, 8)
      
    • List、string转化为Tuple

      tuple([4, 0, 2])
      tup = tuple('string')
      
    • 支持索引访问

      tup[0]
      
    • 一旦被定义后就无法修改其中元素

      tup = tuple(['foo', [1, 2], True])
      tup[2] = False (会报错)
      
    • 如果其中某个元素是一个列表,那么其中那个列表的元素可以修改

      tup[1].append(3)  -> ('foo', [1, 2, 3], True)
      
    • Tuple支持连接

      (4, None, 'foo') + (6, 0) + ('bar',)  -> (4, None, 'foo', 6, 0, 'bar')
      
    • Tuple支持乘法操作,其实际效果是将元组重复拼接多次

      ('foo', 'bar') * 4 -> ('foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'bar')
      
    • Tuple支持拆包(unpacking)操作

      tup = (4, 5, 6)
      a, b, c = tup   #a, b, c分别对应Tuple的三个元素
      b, a = a, b    #利用Tuple拆包交换变量的值很方便,这一行执行后a、b的值即完成交换
      seq = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]
      for a, b, c in seq:
         print('a={0}, b={1}, c={2}'.format(a, b, c))   #a,b,c就对应列表中某一元素的三个值
      
    • Tuple支持高级拆包(用*收集剩余元素)

      values = 1, 2, 3, 4, 5
      a, b, *rest = values  # rest=[3,4,5],即*rest可以收集任意长度的剩余元素
      
    • Tuple不支持修改,因而其方法主要是查询类操作(如count)

      a = (1, 2, 2, 2, 3, 4, 2)
      a.count(2) = 4
      
  1. List(列表)

    • List定义

      a_list = [2, 3, 7, None]
      
    • tuple转化为list

      tup = ('foo', 'bar', 'baz')
      b_list = list(tup)
      
    • list支持修改元素

      b_list[1] = 'peekaboo'
      
    • list可以取出迭代器元素

      gen = range(10)
      list(gen) = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
      
    • 增删元素

      b_list.append('dwarf') #在末尾添加
      b_list.insert(1, 'red') #在指定位置添加(插队),被顶的那个元素往后排一下
      insert比较耗时,如果你需要在头尾都插入元素,则要考虑collections.deque(double-ended queue,双端队列)
      b_list.pop(2) #删除索引为2的元素(即第三个元素),并返回该元素
      b_list.remove('foo')  #删除值为“foo”的元素
      
    • 判断元素是否在list中

      'dwarf' in b_list / 'dwarf' not in b_list
      
    • List相连接

      [4, None, 'foo'] + [7, 8, (2, 3)]
      x = [4, None, 'foo'] 
      x.extend([7, 8, (2, 3)])  # extend函数也能起到连接的作用,extend函数要花费更少的时间
      everything = []
      for chunk in list_of_lists:
         everything.extend(chunk)
      
    • List内部排序

      import bisect

      a = [7, 2, 5, 1, 3]
      # Sort in place. sort() also accepts a key function, e.g. sort(key=len)
      # orders strings by their length.
      a.sort()
      # On an already-sorted list, bisect.bisect returns the index at which the
      # value would be inserted to keep the order; it does not modify the list.
      bisect.bisect(a, 2)
      # bisect.insort actually inserts the value at that position.
      # (The original called the non-existent name bisect.insortb.)
      bisect.insort(a, 2)
      
    • List切片

      seq = [7, 2, 3, 7, 5, 6, 0, 1]
      seq[1:5] = [2, 3, 7, 5]   #前闭后开的取值
      seq[3:4] = [6, 3] #通过切片集中修改某些位置的值
      seq[:5] seq[3:]  #切片的起点或终点皆可省略
      seq[-6:-2] #可以用负数索引从末尾开始切片,仍然是左闭右开
      seq[::2] #还可以指定步伐。前进两步从头选到尾
      seq[::-1]  #如此便可以反转整个列表,是非常聪明的方式**
      
  1. 内置的sequence方案

    • enumerate(枚举)

      for i,value in enumerate(a):
          print(i,value)     
      

      enumerate中mapping的妙用

      some_list = ['foo', 'bar', 'baz']
      mapping = {}
      for i, v in enumerate(some_list):
         mapping[v] = i
      
    • sorted(排序,支持排序参数,返回的是一个新的列表)

      a= [2,6,5,4,7]
      sorted(a) #a不变,输出是排序后的
      sorted('horse race') #对字符的ASCII码进行排序
      
    • zip(打包功能,看代码)

      seq1 = ['foo', 'bar', 'baz']
      seq2 = ['one', 'two', 'three']
      zipped = zip(seq1, seq2)
      list(zipped) #[('foo', 'one'), ('bar', 'two'), ('baz', 'three')]
      seq3 = [False, True]
      list(zip(seq1, seq2, seq3)) #[('foo', 'one', False), ('bar', 'two', True)]
      #结果取决于最短的那个seq
      
      #### 与enumerate结合
      for i, (a, b) in enumerate(zip(seq1, seq2)):
          print('{0}: {1}, {2}'.format(i, a, b))
          
      #### 巧用zip实现unzip
      pitchers = [('Nolan', 'Ryan'), ('Roger', 'Clemens')]
      first_names, last_names = zip(*pitchers) #first_names =('Nolan', 'Roger')
      
    • 反转sequence

      list(reversed(range(10)))  #[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
      
  2. dict(字典)

    • 初始化

      empty_dict = {}
      d1 = {'a' : 'some value', 'b' : [1, 2, 3, 4]}
      d1[7] = 'an integer'
      
    • 判断是否在dict

      'b' in d1
      
    • 删除dict元素

      del d1[5]  #带一个key
      d1.pop('dummy') #返回该键对应的值,同时从d1中删除这个键值对
      
    • 更新dict内的键值,主要是值

      list(d1.keys())  list(d1.values())  #key与value的顺序一致的
      d1.update({'b' : 'foo', 'c' : 12})  #
      
    • 可以使用由二元tuple组成的序列来创建一个dict

      mapping = dict(zip(range(5), reversed(range(5))))
      
    • 设置默认值

      value = some_dict.get(key, default_value)
      # Sample input matching the grouping result shown below.
      words = ['apple', 'bat', 'bar', 'atom', 'book']

      # Group words by first letter with dict.setdefault: the empty list is
      # stored only the first time a letter is seen.
      by_letter = {}
      for word in words:
          letter = word[0]
          by_letter.setdefault(letter, []).append(word)
      # by_letter == {'a': ['apple', 'atom'], 'b': ['bat', 'bar', 'book']}

      # Same grouping with defaultdict: missing keys are created by calling list().
      from collections import defaultdict
      by_letter = defaultdict(list)
      for word in words:
          by_letter[word[0]].append(word)
      
    • 散列性

      hash('string')
      hash((1, 2, (2, 3)))
      hash((1, 2, [2, 3]))  # wrong 
      d[tuple([1, 2, 3])] = 5  # {(1, 2, 3): 5}  可hash
      
  3. set(没有值的字典)

    set([2, 2, 2, 1, 3, 3]) #{1, 2, 3}
    a = {1, 2, 3, 4, 5}
    b = {3, 4, 5, 6, 7, 8}
    a.union(b) #或者a | b
    a.intersection(b) #或者a & b
    c = a.copy()
    
    ###set内元素顺序不一致,并不影响set的相等
    
  4. 高阶用法

    • 优雅化

      # string
      strings = ['a', 'as', 'bat', 'car', 'dove', 'python']
      [x.upper() for x in strings if len(x) > 2]
      
      #dict
      loc_mapping = {val : index for index, val in enumerate(strings)}
      
      #set 
      unique_lengths = {len(x) for x in strings}
      set(map(len, strings))
      
    • 嵌套优雅

      #list
      # Collect every name that contains at least two 'e' characters.
      # all_data is expected to be a list of lists of names (defined elsewhere).
      names_of_interest = []
      for names in all_data:
          enough_es = [name for name in names if name.count('e') >= 2]
          names_of_interest.extend(enough_es)
      # The same result as one nested list comprehension: the for clauses are
      # written in the same order as the equivalent nested loops.
      result = [name for names in all_data for name in names
                if name.count('e') >= 2]
      
      #set
      some_tuples = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]
      flattened = [x for tup in some_tuples for x in tup]#[1, 2, 3, 4, 5, 6, 7, 8, 9]
      [[x for x in tup] for tup in some_tuples] #[[1, 2, 3], [4, 5, 6], [7, 8, 9]]
      #注意区别
      
  5. 函数

    • 基本函数定义

      def my_function(x, y, z=1.5):
          """Scale the sum of x and y by z.

          z is optional and defaults to 1.5. When z is greater than 1 the sum
          is multiplied by z; otherwise z is divided by the sum.
          """
          total = x + y
          return z * total if z > 1 else z / total
      ## 函数的多种调用方式
      my_function(5, 6, z=0.7)
      my_function(3.14, 7, 3.5)
      my_function(10, 20)
      my_function(x=5, y=6, z=7)
      my_function(y=6, x=5, z=7)
      
    • 函数的作用域:全局与本地

      def bind_a_variable():
          """Bind the module-level name `a` to a new empty list.

          Shows that a function can assign to a global through the `global`
          statement; this style is discouraged in real code.
          """
          global a
          a = list()
      bind_a_variable()
      
    • python函数可以返回多个值

      def f():
          """Return three values at once, packed into a single tuple."""
          first, second, third = 5, 6, 7
          # Alternative: return a dict such as {'a': first, 'b': second, 'c': third}.
          return first, second, third
      a, b, c = f()
      return_value = f()  #return_value就是一个三元元组
      
    • 函数也是对象(object),可以像普通值一样放进列表、作为参数传递

      states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia', 'FlOrIda',
                'south carolina##', 'West virginia?']
      
      ##  常规操作
      import re
      def clean_strings(strings):
          """Return a cleaned copy of each string in `strings`.

          Each value is stripped of surrounding whitespace, has every '!', '#'
          and '?' removed, and is then converted to title case.
          """
          return [
              re.sub('[!#?]', '', raw.strip()).title()
              for raw in strings
          ]
      
      ## 把function当成一个object
      def remove_punctuation(value):
          """Return `value` with every '!', '#' and '?' character removed."""
          unwanted = '[!#?]'
          return re.sub(unwanted, '', value)
      clean_ops = [str.strip, remove_punctuation, str.title]
      def clean_strings(strings, ops):
          """Apply every callable in `ops`, in order, to each string.

          Returns a new list with the transformed values.
          Example call: clean_strings(states, clean_ops)
          """
          def _run_pipeline(text):
              # Feed the output of each operation into the next one.
              for op in ops:
                  text = op(text)
              return text

          return [_run_pipeline(item) for item in strings]
      
      # map function
      # map lazily applies remove_punctuation to every element of states.
      # (The original for statement was missing its trailing colon.)
      for x in map(remove_punctuation, states):
          print(x)
      
    • lambda函数

      def short_function(x):
          """Return x multiplied by 2.

          Equivalent anonymous form: equiv_anon = lambda x: x * 2
          """
          doubled = x * 2
          return doubled
      
      def apply_to_list(some_list, f):
          """Return a new list holding f(item) for every item of `some_list`."""
          transformed = []
          for item in some_list:
              transformed.append(f(item))
          return transformed
      ints = [4, 0, 1, 5, 6]
      apply_to_list(ints, lambda x: x * 2)
      
      strings = ['foo', 'card', 'bar', 'aaaa', 'abab']
      strings.sort(key=lambda x: len(set(list(x))))#['aaaa', 'foo', 'abab', 'bar', 'card']
      
      def add_numbers(x, y):
          """Return the sum of x and y."""
          total = x + y
          return total
      add_five = lambda y: add_numbers(5, y)
      
      from functools import partial
      add_five = partial(add_numbers, 5)
      
    • 生成器

      def squares(n=10):
          """Generate the squares of the integers from 1 to n inclusive.

          A message is printed when iteration starts, not when the generator
          object is created.

          More compact alternatives using generator expressions:
              gen = (x ** 2 for x in range(100))
              sum(x ** 2 for x in range(100))
              dict((i, i ** 2) for i in range(5))
          """
          upper = n ** 2
          print('Generating squares from 1 to {0}'.format(upper))
          yield from (number ** 2 for number in range(1, n + 1))
      gen = squares()
      for x in gen:
          print(x, end=' ')
          
      import itertools
      first_letter = lambda x: x[0]
      names = ['Alan', 'Adam', 'Wes', 'Will', 'Albert', 'Steven']
      for letter, names in itertools.groupby(names, first_letter):
          print(letter, list(names)) # names is a generator
      '''
      A ['Alan', 'Adam']
      W ['Wes', 'Will']
      A ['Albert']
      S ['Steven']
      '''
      

      [图片上传失败...(image-e82e5-1579059046663)]

    • 报错

      def attempt_float(x):
          """Return x converted to float, or x unchanged if it cannot be converted.

          Only the errors float() raises for unconvertible input are handled:
          TypeError (unsupported type), ValueError (malformed string) and
          OverflowError (integer too large). A bare `except:` would also
          swallow KeyboardInterrupt and SystemExit, so those now propagate.
          """
          try:
              return float(x)
          except (TypeError, ValueError, OverflowError):
              return x
      # 不论try中是否发生异常,finally中的代码都会被执行
      f = open(path, 'w')    
      try:
         write_to_file(f)
      finally:
         f.close()
      # 整套流程
      f = open(path, 'w')
      try:
         write_to_file(f)
      except:
         print('Failed')
      else:
         print('Succeeded')
      finally:
         f.close()
      
  1. 文件操作

    path = 'examples/segismundo.txt'
    f = open(path) #默认为“r” ,read-only打开
    #w:创建新文件,如果该路径下已有同名文件则覆盖;x:创建新文件但不覆盖,如果文件已存在就报错
    lines = [x.rstrip() for x in open(path)] # rstrip去掉字符串末尾的空白字符(默认包括空格、换行符等)
    f.close()
    # 一个简单的方式
    with open(path) as f:
        lines = [x.rstrip() for x in f]  #这个会自动关闭f
    #读取
    f = open(path)
    f.read(10)
    f.tell() #告知当前读的位置
    import sys
    sys.getdefaultencoding() # 'utf-8'
    f.seek(3) # 3
    f.read(1) #先用seek定位到位置3,再从该位置读取1个字符
    # 写入到文件,按行写
    with open('tmp.txt', 'w') as handle:
        handle.writelines(x for x in open(path) if len(x) > 1)
    with open('tmp.txt') as f:
        lines = f.readlines()
    # 对文件解码
    # Raw UTF-8 bytes; b'\xc3\xb1' is the two-byte encoding of 'ñ'.
    # (The original line was missing the `=` of the assignment.)
    data = b'Sue\xc3\xb1a el '
    # Decoding works only on complete byte sequences: data[:4].decode('utf8')
    # cuts the two-byte character in half and raises UnicodeDecodeError.
    data.decode('utf8')
    
    image

    [图片上传失败...(image-d399cd-1579059046663)]

你可能感兴趣的:(利用Python进行分析-Chapter 3)