pythoncookbook 第4章生成器与迭代器

[toc]

4 迭代器与生成器

for xxx in xxx理解

__iter__ 几乎就是为 for xx in xx 这种写法而设计的

for xxx in xxx在迭代器中应用

调用 iter() 函数,返回一个迭代器对象 items = iter(object)
不断调用此对象的 items.next(), 捕捉到 StopIteration 后结束

########for xxx in xxx理解
###########1
it = iter(lines)  # same as: it = lines.__iter__()
while True:
    try:
        # Pull the next item; the value itself is discarded in this demo.
        next(it)
    except StopIteration:
        # next() raises StopIteration once the iterator is exhausted.
        print('finish iteration')
        break
################2
class ABC(object):
    """Iterable whose __iter__ hands back a separate iterator object."""

    def __iter__(self):
        return ABC2()


class ABC2(object):
    """Iterator that counts down from 99 to 0, then raises StopIteration."""

    # Class-level start value; the first ``self.num -= 1`` creates a
    # per-instance attribute, so every ABC2() counts down independently.
    num = 100

    def __next__(self):
        self.num -= 1
        if self.num < 0:
            raise StopIteration()
        return self.num

    # Python 2 looks up ``next``; Python 3 looks up ``__next__``.
    next = __next__


for i in ABC():
    print(i)

for xxx in xxx在生成器
运行生成器内部代码，将yield的逐个值取出

####for xxx in xxx理解
def abc(x):
    """Generator that prints 123 and then yields ``x`` exactly once."""
    print(123)
    yield x


for i in abc('hello world'):
    print(i)
# The generator object returned by abc('hello world') already provides
# __iter__, next (__next__ on Python 3), send and throw.
 
##类的实现
class ABC(object):
    """Iterable whose __iter__ returns a generator produced by abc()."""

    def __iter__(self):
        return self.abc('hello world')

    def abc(self, x):
        """Generator method: print 123, then yield ``x`` once."""
        print(123)
        yield x


for i in ABC():
    print(i)
# The for loop above is equivalent to driving the iterator by hand:
items = iter(ABC())
while True:
    try:
        # The next() builtin works on both Python 2.6+ and Python 3.
        print(next(items))
    except StopIteration:
        break

迭代器

在python中实现了__iter__方法的对象是可迭代的,即可以对它调用内置函数iter(),由__iter__返回(return)用于迭代的对象.
实现了next()方法的对象是迭代器:先得到iter()返回的对象,再不断调用它的next()方法

class Fib(object):
    """Iterator over the Fibonacci numbers that do not exceed 100.

    The object is its own iterator, so it can be consumed only once.
    """

    def __init__(self):
        self.a, self.b = 0, 1

    def __iter__(self):
        return self

    def __next__(self):
        # Advance the pair first, then decide whether the new value is
        # still within the limit.
        self.a, self.b = self.b, self.a + self.b
        if self.a > 100:
            raise StopIteration()
        return self.a

    # Python 2 looks up ``next``; Python 3 looks up ``__next__``.
    next = __next__
 
fetch = iter(Fib())  # obtain the object that will be iterated
while True:
    # Keep calling next() until StopIteration ends the loop.
    try:
        i = next(fetch)
        print(i)
    except StopIteration:
        break

# The manual loop above is what ``for xxx in XXX`` does for you:
for i in Fib():
    print(i)

生成器

生成器是对象.保留栈帧的上下文

def abc(x):
    """Generator that prints 123 and then yields ``x`` exactly once."""
    print(123)
    yield x


abc("hello world")  # Nothing is printed yet: this only creates a generator object.
next(abc("hello world"))  # The generator body runs only now, up to the first yield.

4.4 实现深度优先的遍历树形节点的生成器

for 循环的理解
yield 其他对象的 yield

class Node(object):
    """Tree node that can walk its subtree depth-first with a generator."""

    def __init__(self, value):
        self._value = value
        self._children = []

    def __repr__(self):
        return "Node{!r}".format(self._value)

    def __iter__(self):
        # Iterating a node walks its direct children only.
        return iter(self._children)

    def add_child(self, node):
        return self._children.append(node)

    def depth_first(self):
        """Yield this node, then every descendant in depth-first order.

        ``yield`` works like ``return`` except that the next call resumes
        where the previous one stopped.  Values yielded by a child's
        generator are yielded again here, one level up.
        """
        yield self
        for c in self:
            # Equivalent to ``yield from c.depth_first()`` on Python 3.3+.
            for items in c.depth_first():
                yield items
 
if __name__ == '__main__':

    root = Node(0)
    child1 = Node(1)
    child2 = Node(2)

    child11 = Node(11)
    child21 = Node(21)

    root.add_child(child1)
    root.add_child(child2)

    child1.add_child(child11)
    child2.add_child(child21)

    child11.add_child(Node(100))
    child21.add_child(Node(200))
    child21.add_child(Node(201))

    # How to read the for loop below:
    # ``ch`` is bound in turn to each value produced by root.depth_first()
    # (each yield acts like a returned value).  root.depth_first() must
    # support the iteration protocol; a generator qualifies.
    for ch in root.depth_first():
        print(ch)

depth_first() 方法首先返回(yield)本身并迭代每一个节点的depth_first()方法,并返回(yield)对应元素

传统方法的实现，缺点繁琐

class Node2(object):
    """Tree node whose depth-first walk is delegated to DepthFirstIterator."""

    def __init__(self, value):
        self._children = []
        self._value = value

    def __repr__(self):
        template = "Node{!r}"
        return template.format(self._value)

    def __iter__(self):
        # Iterating a node walks its direct children only.
        children = self._children
        return iter(children)

    def add_child(self, node):
        # list.append returns None, and that result is passed through.
        appended = self._children.append(node)
        return appended

    def depth_first(self):
        # Hand the traversal over to the explicit iterator class.
        walker = DepthFirstIterator(self)
        return walker
 
 
class DepthFirstIterator(object):
    """Hand-written iterator that walks a node's subtree depth-first.

    The start node must be iterable over its children, and every child
    must offer ``depth_first()`` returning an iterator over its own
    subtree.
    """

    def __init__(self, start_node):
        self._node = start_node
        # Iterator over the start node's children; None until the start
        # node itself has been returned.
        self._children_iter = None
        # Iterator over the subtree of the child currently being walked.
        self._child_iter = None

    def __iter__(self):
        return self

    def __next__(self):
        if self._children_iter is None:
            # First call: return the start node and prepare its children.
            self._children_iter = iter(self._node)
            return self._node

        elif self._child_iter:
            # A child subtree is in progress: keep draining it.
            try:
                nextchild = next(self._child_iter)
                return nextchild
            except StopIteration:
                # That subtree is finished; move on to the next child.
                self._child_iter = None
                return next(self)

        else:
            # Start the next child's subtree.  The StopIteration raised
            # here when no children remain ends the whole iteration.
            self._child_iter = next(self._children_iter).depth_first()
            return next(self)

    # Python 2 looks up ``next``; Python 3 looks up ``__next__``.
    next = __next__

4.5 反向迭代

采用内置的函数 reversed()
被迭代的对象必须实现 __reversed__() 方法(或者是支持 __len__ 和 __getitem__ 的序列)

class Countdown(object):
    """Count from ``start`` down to 1, or from 1 up to ``start`` when reversed."""

    def __init__(self, start):
        self.start = start

    def __iter__(self):
        # Forward iteration: start, start - 1, ... while the value is positive.
        remaining = self.start
        while True:
            if not remaining > 0:
                return
            yield remaining
            remaining -= 1

    def __reversed__(self):
        # Picked up by the reversed() builtin: 1, 2, ... up to start.
        current = 1
        while True:
            if not current <= self.start:
                return
            yield current
            current += 1
 
 
if __name__ == '__main__':
    # reversed() uses Countdown.__reversed__, so this counts 1..30.
    for rr in reversed(Countdown(30)):
        print(rr)
    # Plain iteration uses Countdown.__iter__, so this counts 30..1.
    for rr in Countdown(30):
        print(rr)

4.6 带有外部参数生成器函数

from collections import deque
 
class LineHistory:
    """Iterate over ``lines`` while remembering the most recent ones.

    ``history`` is a bounded deque of ``(line_number, line)`` tuples that
    holds at most ``hislen`` entries; line numbers start at 1.
    """

    def __init__(self, lines, hislen=3):
        self.history = deque(maxlen=hislen)
        self.lines = lines

    def __iter__(self):
        for lineno, line in enumerate(self.lines, start=1):
            # Record the line before handing it to the consumer.
            record = (lineno, line)
            self.history.append(record)
            yield line

    def clear(self):
        """Forget every remembered line."""
        self.history.clear()

4.7 迭代器切片

得到迭代器生成的切片对象

import itertools
def count(n):
    """Yield n, n + 1, n + 2, ... without end."""
    while True:
        yield n
        n += 1


c = count(0)
# c[10:20] would raise TypeError: generators do not support slicing.
# islice(c, 10, 21) skips the first 10 values and then yields 10..20.
for items in itertools.islice(c, 10, 21):
    print(items)

函数 islice() 返回一个可以生成指定元素的迭代器,它通过遍
历并丢弃直到切片开始索引位置的所有元素。然后才开始一个个的返回元素,并直到切片结束索引位置。缺点不能重复使用迭代器里面的数据

4.8 跳过不需要的迭代部分

？？？跳过一个可迭代对象的开始部分，对后面的不影响？
创建一个迭代器，
只要函数predicate(item)为True，就丢弃iterable中的项，
如果predicate返回False，就会生成iterable中的项和所有后续项。

from itertools import dropwhile
with open('manage.py') as f:
    # Skip the leading run of lines that start with "#"; every line after
    # the first non-matching one is passed through, comment or not.
    for line in dropwhile(lambda line: line.startswith("#"), f):
        print(line)

4.9 排列组合实现

比如排列A23 ，组合 C23等

from itertools import permutations, combinations, combinations_with_replacement
items = ['a', 'b', 'c']
for c in permutations(items):  # all orderings of 3 items: P(3,3) = 6 results
    print(c)
for c in permutations(items, 2):  # ordered pairs: P(3,2) = 6 results
    print(c)
for c in combinations(items, 3):  # unordered picks of 3 out of 3: C(3,3) = 1 result
    print(c)
for c in combinations_with_replacement(items, 3):  # an item may repeat: C(5,3) = 10 results
    print(c)

4.10 序列上索引迭代

my_list = ['a', 'b', 'c']
# The second argument makes the numbering start at 1 instead of 0.
for idx, val in enumerate(my_list, 1):
    print(idx, val)

这种情况在你遍历文件时想在错误消息中使用行号定位时候非常有用:

def parse_data(filename):
    """Scan a whitespace-separated text file and report unparsable lines.

    The second field of every line is converted to int.  Lines where that
    fails (non-numeric value, or fewer than two fields) are reported on
    stdout together with their 1-based line number.

    :param filename: path of the text file to read.
    """
    with open(filename, 'rt') as f:
        for lineno, line in enumerate(f, 1):
            fields = line.split()
            try:
                count = int(fields[1])
                # ... further processing of ``count`` goes here ...
            except (ValueError, IndexError) as e:
                # IndexError covers blank or single-field lines, which
                # would otherwise abort the whole scan.
                print('Line {}: Parse error: {}'.format(lineno, e))

data = [(1, 2), (3, 4), (5, 6), (7, 8)]
# The parenthesised (x, y) target unpacks each tuple while enumerate
# supplies the index n.
for n, (x, y) in enumerate(data):
    print(n, x, y)

4.11 迭代多个序列 zip()

zip() 会创建一个迭代器来作为结果返回

基本用法压缩

a = [1, 2, 3]
b = ['w', 'x', 'y', 'z']
# zip() stops at the shortest input, so 'z' is dropped.
for i in zip(a, b):
    print(i)
# Output:
# (1, 'w')
# (2, 'x')
# (3, 'y')

from itertools import zip_longest
# zip_longest() runs to the longest input and pads with fillvalue.
for i in zip_longest(a, b, fillvalue=None):
    print(i)
# Output:
# (1, 'w')
# (2, 'x')
# (3, 'y')
# (None, 'z')

打包字典，变为列表

headers = ['name', 'shares', 'price']
values = ['ACME', 100, 490.1]

# Pair each header with the value at the same position and build a mapping.
s = {key: value for key, value in zip(headers, values)}

# The same pairing as a list of (header, value) tuples; the result is
# only displayed when run interactively.
list(zip(headers, values))

zip() 可以接受多于两个的序列的参数 zip(a, b, c)

4.12 不同集合上元素的迭代 chain()

from itertools import chain
a = [1, 2, 3, 4]
b = ['x', 'y', 'z']
# Walk both lists back to back without building a combined list first.
for x in chain.from_iterable((a, b)):
    print(x)

a，b可以为不同的类型 chain（set，list）甚至是chain（dict，list）

# Inefficient: a + b builds a brand-new sequence before the loop starts
for x in a + b:
    print(x)
# Better: chain() walks a and then b without copying either of them
for x in chain(a, b):
    print(x)

第一种方案中, a + b 操作会创建一个全新的序列并要求 a 和 b 的类型一致
chain() 不会有这一步,所以如果输入序列非常大的时候会很省内存。并且当可迭代
对象类型不一样的时候 chain() 同样可以很好的工作。

4.13 创建数据管道

os.walk 从文件夹某个位置开始遍历

import os

# x is the directory being visited, y the names of its sub-directories
# and z the names of the files it contains.
for x, y, z in os.walk(r"D:\Workspace\sell"):
    for zpieces in z:
        # os.path.join inserts the path separator that plain string
        # concatenation of directory and file name leaves out.
        print(os.path.join(x, zpieces))

fnmatch.filter(filelist, filepat)
filelist 为文件名列表,返回其中符合 filepat 模式的文件名列表
若要判断单个文件名(str)是否匹配,使用 fnmatch.fnmatch(name, filepat),它返回布尔值

# encoding:utf-8
import os
import fnmatch
import gzip
import bz2
import re
 
 
def gen_find(filepat, top):
    """Yield the path of every file under ``top`` whose name matches ``filepat``.

    The directory tree is walked with os.walk and names are matched with
    shell-style wildcards.  Each yielded path is the visited directory
    joined with the file name.
    """
    for dirpath, _subdirs, filenames in os.walk(top):
        # Keep only the names in this directory that match the pattern.
        matches = fnmatch.filter(filenames, filepat)
        for match in matches:
            yield os.path.join(dirpath, match)
 
 
def gen_opener(filenames):
    """Open each named file in turn and yield the open file object.

    Names ending in ``.gz`` are opened with gzip and names ending in
    ``.bz2`` with bz2, both in text mode; anything else uses the built-in
    open().  A file is closed when the consumer asks for the next one, and
    also when the generator is closed before it is exhausted.

    :param filenames: iterable of path strings.
    """
    for filename in filenames:
        if filename.endswith('.gz'):
            f = gzip.open(filename, 'rt')
        elif filename.endswith('.bz2'):
            # TODO (from the original notes): this branch may be problematic.
            # NOTE(review): bz2.open exists only on Python 3.3+ — confirm
            # the target interpreter.
            f = bz2.open(filename, "rt")
        else:
            f = open(filename, "r")
        try:
            yield f
        finally:
            # Runs on normal resumption and also when the generator is
            # closed early, so the handle is never left open.
            f.close()
 
 
def gen_concatenate(iterators):
    """Chain several iterables into one stream of their items.

    ``iterators`` is an iterable of iterables (for example open file
    objects); each one is drained completely before the next is fetched.
    """
    flattened = (item for source in iterators for item in source)
    for element in flattened:
        # Consumers receive the items with a plain ``for ... in ...`` loop.
        yield element
 
 
def gen_grep(pattern, lines):
    """Yield only those items of ``lines`` in which ``pattern`` is found.

    ``pattern`` is compiled as a regular expression once, and each line is
    tested with ``search`` (a match anywhere in the line counts).
    """
    find = re.compile(pattern).search
    for candidate in lines:
        if find(candidate) is None:
            continue
        yield candidate
 
 
lognames = gen_find("*.py", r"D:\Workspace\sell")
files = gen_opener(lognames)
lines = gen_concatenate(files)

pylines = gen_grep(r'^class ', lines)  # keep only lines that start a class definition
for line in pylines:
    print(line)
# TODO (from the original notes): the remaining stages are not understood yet
#bytecolumn = (line.rsplit(None,1)[1] for line in pylines)
#bytes = (int(x) for x in bytecolumn if x != '-')
#print('Total', sum(bytes))

看不懂嵌套的生成器，请看下面的例子

def gen1():
    """Yield two inner lists, one after the other."""
    for i in [[1, 2, 3, 4, 5], [6, 7, 8, 9, 0]]:
        yield i


def gen2(i):
    """Flatten one level: yield every element of every item of ``i``."""
    for j in i:
        for k in j:
            yield k


g1 = gen1()
g2 = gen2(g1)
for x in g2:
    print(x)

不太靠谱的理解，for xx in xx 可以解开生成器，要想得到生成器里的内容，for xx in xx 层数大于生成器嵌套的层数。

4.14 递归生成器展开嵌套的序列

原代码采用 yield from 实现python2 不支持可用 for i in xx ： yield i 代替

# encoding:utf-8
try:
    from collections.abc import Iterable  # Python 3.3+
except ImportError:
    from collections import Iterable  # Python 2


def flatten(items, ignore_types=(str, bytes)):
    """Yield the leaf values of an arbitrarily nested iterable.

    :param items: iterable that may contain further iterables.
    :param ignore_types: types that are iterable but must be treated as
        single values; strings and bytes by default.
    """
    for x in items:
        # Recurse into anything iterable, except the ignored types.
        if isinstance(x, Iterable) and not isinstance(x, ignore_types):
            # Pass ignore_types down so nested levels honour the caller's
            # choice instead of falling back to the default.
            for i in flatten(x, ignore_types):
                yield i
        else:
            yield x


items1 = [1, 2, [3, 4, [5, 6], 7], 8]
items2 = ['Dave', 'Paula', ['Thomas', 'Lewis']]
l1 = list(flatten(items1))
l2 = list(flatten(items2))

4.15 有序对象合并再排序

heapq.merge()
heapq.merge 生成器迭代特性意味着它不会立马读取所有序列。这就意味着你可以在非
常长的序列中使用它,而不会有太大的开销

import heapq
a = [1, 4, 7, 10]
b = [2, 5, 6, 11]
# heapq.merge(a, b) returns a lazy iterator over the merged, sorted
# values; list() drains it.
l = list(heapq.merge(a, b))
# l == [1, 2, 4, 5, 6, 7, 10, 11]

4.16 迭代器代替while循环

其实就是用遍历代替while.
途径:iter(function, sentinel)能够迭代,具体参考本节末尾
常见的IO程序,伪代码

CHUNKSIZE = 8192


def reader(s):
    """Pass the data received from ``s`` to process_data() chunk by chunk.

    Up to CHUNKSIZE bytes are requested per ``s.recv`` call; an empty
    bytes result ends the loop.
    """
    data = s.recv(CHUNKSIZE)
    while data != b'':
        process_data(data)
        data = s.recv(CHUNKSIZE)
 
# NOTE(review): reader() calls s.recv(), which a plain file object does not
# provide — this looks like pseudocode for a socket; confirm before running.
f = open("views.py", "r")
reader(f)
 
# The same loop expressed with the two-argument form of iter()
def reader2(s):
    """Drain ``s`` in CHUNKSIZE pieces until ``s.recv`` returns b""."""
    def _receive():
        return s.recv(CHUNKSIZE)

    # iter(callable, sentinel) calls _receive() until it returns b"".
    for _chunk in iter(_receive, b""):
        # Placeholder: a real reader would process each chunk here.
        pass

实例代码

import sys
# The with-block closes the file once the loop is done.
with open("views.py", "r") as f:
    # iter(callable, sentinel): read 10 characters at a time until read()
    # returns the empty string at end of file.
    for chunk in iter(lambda: f.read(10), ""):
        n = sys.stdout.write(chunk)

iter()内置函数：
单参数时 iter(obj),obj 必须支持迭代协议(实现 __iter__ 或 __getitem__),否则抛出 TypeError
两个参数时 iter(func, sentinel),它接受一个 callable 对象和一个标记(结
尾)值作为输入参数,每次调用 next() 都会调用 func(),当 func 返回值与标记相等时,抛出 StopIteration

x = 0


def func():
    """Increment the module-level counter ``x``, print it and return it."""
    global x
    x += 1
    print(x)
    return x


# Build the sentinel iterator once: every next() calls func(), and
# StopIteration is raised as soon as func() returns 100.
i = iter(func, 100)
while True:
    try:
        next(i)
    except StopIteration:
        print('停止迭代')
        break

本章总结

迭代器
迭代器协议几乎是为 for xx in xx 设计的,
什么是迭代器呢？遵循 __iter__ 和 next() 这两个协议的对象。即 __iter__ 返回用于迭代的对象，这个对象有 next() 方法。
不断地调用 next()，直到抛出 StopIteration，迭代结束。
生成器
涉及太多不在本章展开

pythoncookbook 第4章 生成器与迭代器