Automatically download the Dive into Python 3 pages and the pages they link to
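The script below is written for Python 2 (print statements, urllib.urlopen). It fetches the table-of-contents page from the woodpecker.org.cn mirror, extracts every link ending in "html" with a regular expression, and then saves each linked page into the current working directory under its own file name.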
# -*- coding: utf-8 -*-
import os
import urllib
import re

# 1. Download the seed page: start from the given seed URL, then download
#    every valid link found on it.
source_link = 'http://woodpecker.org.cn/diveintopython/toc/index.html'
f = open(source_link.split("/")[-1], 'w')
print 'save to file : ', os.getcwd()
page = urllib.urlopen(source_link)
page_content = page.read()

# Find the valid links.
#   .    matches any character except a newline
#   *    matches the preceding element 0 or more times
#   +    matches the preceding element 1 or more times
#   []   matches any single character in the set
#   [^]  matches any single character not in the set
m = re.findall(r'<a href="([^"#]+)".*>.*</a>', page_content)
mm = []
for mi in m:
    if (mi not in mm) and (mi.endswith('html')):
        mm.append(mi)
f.write(page_content)
f.close()

# 2. Download each linked page.
print "%d pages to be downloaded" % len(mm)
for mmi in mm:
    # os.path.dirname("a/b/c.txt")  ==> a/b
    # os.path.basename("a/b/c.txt") ==> c.txt
    sub_link = os.path.dirname(source_link) + "/" + mmi
    f = open(sub_link.split("/")[-1], 'w')
    page = urllib.urlopen(sub_link)
    page_content = page.read()
    f.write(page_content)
    f.close()
    print '.',
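Since urllib.urlopen and the print statement no longer exist in Python 3, here is a minimal sketch of the same crawler for Python 3. It assumes the mirror URL above is still reachable; the seed URL, the link pattern, and the overall flow are taken from the script above, with only the I/O swapped for urllib.request and binary file writes.

import os
import re
import urllib.request

source_link = 'http://woodpecker.org.cn/diveintopython/toc/index.html'

# Download the seed (table of contents) page and save it under its own name.
with urllib.request.urlopen(source_link) as page:
    page_content = page.read()                      # bytes
with open(source_link.split('/')[-1], 'wb') as f:
    f.write(page_content)

# Collect unique links ending in "html"; decode only for the regex search.
links = re.findall(r'<a href="([^"#]+)"', page_content.decode('utf-8', 'replace'))
targets = []
for link in links:
    if link.endswith('html') and link not in targets:
        targets.append(link)

print('%d pages to be downloaded' % len(targets))

# The extracted links are relative to the directory of the seed page.
base = os.path.dirname(source_link)
for target in targets:
    sub_link = base + '/' + target
    with urllib.request.urlopen(sub_link) as page:
        content = page.read()
    with open(sub_link.split('/')[-1], 'wb') as f:
        f.write(content)
    print('.', end='', flush=True)

Writing the pages in binary mode ('wb') avoids having to guess each page's encoding: the bytes returned by read() are stored exactly as the server sent them.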