Beta 版本，代码比较粗糙。
使用说明以Windows为例, Python版本为2.7.6
cd C:\mechanize-0.2.5 && C:\Python27\python setup.py install
cd C:\beautifulsoup4-4.3.2 && C:\Python27\python setup.py install
cd C:\picture && C:\Python27\python meizitu_spider.py
代码:
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
# Filename: meizitu_spider.py
"""Crawl the listing pages under ``host + 'a/'`` and save each album's images.

For every listing page the script follows each ``div.metaRight`` link to an
album page, creates a folder named after the link text inside
``parent_folder`` and writes every ``<img>`` found in ``div#picture`` into it.
It then follows the pager link returned by ``find_next_page`` until there is
none left.
"""
import os

import mechanize
from bs4 import BeautifulSoup

# Shared browser used for every HTML page request.
br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [
    ("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"),
]
# Uncomment to send the requests through an HTTP proxy:
# br.set_proxies({"http": "proxy.host.com:port"})


def parse_url(url):
    """Open *url* with the shared browser and return the parsed page.

    The response body is handed to BeautifulSoup with ``gb18030`` as the
    source encoding.
    """
    br.open(url)
    response = br.response()
    return BeautifulSoup(response.read(), from_encoding='gb18030')


def find_next_page(soup):
    """Return the ``<a>`` tag of the second-to-last pager item, or ``None``.

    The pager is ``div#wp_page_numbers``; its second-to-last ``<li>`` is
    treated as the "next page" entry.  ``None`` is returned when the pager is
    missing, has fewer than two items, or that item contains no link.
    """
    pager = soup.find('div', id='wp_page_numbers')
    if pager is None:
        return None
    page_nums = pager.find_all('li')
    if len(page_nums) < 2:
        return None
    return page_nums[-2].find('a')


def _save_image(img_src, album_name):
    """Download *img_src* and write it into the *album_name* folder."""
    # rfind() yields -1 when there is no '/', so the whole string is used as
    # the file name instead of raising ValueError like rindex() would.
    img_name = img_src[img_src.rfind('/') + 1:]
    picture_data = mechanize.urlopen(img_src)
    try:
        with open(os.path.join(album_name, img_name), 'wb') as picture:
            picture.write(picture_data.read())
    finally:
        picture_data.close()


def _download_album(pic_link, target_folder):
    """Save all images of the album behind *pic_link* under *target_folder*.

    Albums whose folder already exists are skipped without any network
    request, so an interrupted crawl can be restarted cheaply.
    """
    album_name = os.path.join(target_folder, pic_link.get_text())
    if os.path.exists(album_name):
        return
    album_url = pic_link.get('href')
    if not album_url:
        return
    album_soup = parse_url(album_url)
    picture_box = album_soup.find('div', id='picture')
    if picture_box is None:
        # No picture container: leave no empty folder behind so a later run
        # can try this album again.
        return
    os.mkdir(album_name)
    for img in picture_box.find_all('img'):
        img_src = img.get('src')
        if img_src:
            _save_image(img_src, album_name)


host = "http://www.meizitu.com/"
next_page_uri = ''
page_count = 1
parent_folder = 'MeiZiTu'

if not os.path.exists(parent_folder):
    os.mkdir(parent_folder)

# Listing URIs already parsed; guards against a pager that links back to a
# page we have seen, which would otherwise loop forever.
visited_uris = set()

while True:
    # Parenthesised single argument prints identically on Python 2 and 3.
    print('Start to parse PAGE %d' % page_count)
    visited_uris.add(next_page_uri)
    soup = parse_url(host + 'a/' + next_page_uri)

    # Download the current page's albums BEFORE deciding whether to stop, so
    # the final listing page is not skipped.
    for pic_link_wrapper in soup.find_all('div', attrs={'class': 'metaRight'}):
        pic_link = pic_link_wrapper.find('a')
        if pic_link is not None:
            _download_album(pic_link, parent_folder)

    next_page = find_next_page(soup)
    if next_page is None:
        break
    candidate_uri = next_page.get('href')
    if not candidate_uri or candidate_uri in visited_uris:
        break
    next_page_uri = candidate_uri
    page_count += 1