初学python3爬虫 (一)

初学python3爬虫 (一)_第1张图片
屏幕截图.png
初学python3爬虫 (一)_第2张图片
屏幕截图2.png

需要引入的模块

#!/usr/bin/env python3
#-*-encoding:utf-8-*-

from urllib import request,parse
import re
import os
import time

发起 URL 请求并返回页面 HTML

def open_url(url, encoding='gbk'):
    """Fetch *url* and return the response body decoded as text.

    Args:
        url: Address to request; it is wrapped in ``urllib.request.Request``.
        encoding: Codec used to decode the raw response bytes. Defaults to
            ``'gbk'``, the codec that was previously hard-coded here.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid in ``encoding``.
    """
    req = request.Request(url)
    # The context manager closes the response even when read()/decode()
    # raises, so a failed page does not leak the underlying connection.
    with request.urlopen(req) as response:
        return response.read().decode(encoding)

获取详情页url

def get_urls():
    """Download the site's front page and return every regex match found in it.

    Returns:
        The list produced by running the link pattern over the front-page
        HTML with ``findall``.
    """
    # NOTE(review): the pattern below is an empty string, so findall() yields
    # only empty matches. The original expression was presumably lost when
    # the article was extracted (its HTML was stripped) -- restore it before
    # relying on this function.
    link_pattern = re.compile(r'')
    homepage_html = open_url('http://www.mm131.com/')
    return link_pattern.findall(homepage_html)

保存

def save(urls):
 for each in list(range(len(urls))):
 url = 'http://www.mm131.com/'+urls[each]
 detail_html = open_url(url)
 time.sleep(5)
 main_img_pattern = re.compile(r'(.*?)共(.*?)页')
 page_total = re.findall(page_pattern,detail_html)
 title_pattern = re.compile(r'
(.*?)<\h5>') #标题名 titles = re.findall(title_pattern,detail_html) #创建文件夹 if os.path.exists('/home/yzw/mm131/'+titles[0]):#是否存在 os.chdir(titles[0]) else: try: os.mkdir('/home/yzw/mm131/'+titles[0]) os.chdir('/home/yzw/mm131/'+titles[0]) except: continue for i in range(int(page_total[0])): if i==0: next_url = url else: num = i+1 a = urls[each][0:-5] next_url = 'http://www.mm131.com/'+str(a)+'_'+str(num)+'.html' every_html = open_url(next_url) every_img_pattern = re.compile(r'(.*?)

开始抓取

# Driver: collect the detail-page links, then run the download inside ./mm131.
urls = get_urls()
title = 'mm131'
# makedirs(..., exist_ok=True) instead of mkdir(): re-running the script must
# not crash with FileExistsError when the output folder is already there.
os.makedirs(title, exist_ok=True)
os.chdir(title)
save(urls)

抓取结果

.
.
.
[('纹身小妹夏美酱酥胸半露诱惑十足(图45)', 'http://img1.mm131.com/pic/2274/45.jpg')]
[('纹身小妹夏美酱酥胸半露诱惑十足(图46)', 'http://img1.mm131.com/pic/2274/46.jpg')]
[('纹身小妹夏美酱酥胸半露诱惑十足(图47)', 'http://img1.mm131.com/pic/2274/47.jpg')]
[('纹身小妹夏美酱酥胸半露诱惑十足(图48)', 'http://img1.mm131.com/pic/2274/48.jpg')]
[('纹身小妹夏美酱酥胸半露诱惑十足(图49)', 'http://img1.mm131.com/pic/2274/49.jpg')]
[('纹身小妹夏美酱酥胸半露诱惑十足(图50)', 'http://img1.mm131.com/pic/2274/50.jpg')]
[('纹身小妹夏美酱酥胸半露诱惑十足(图51)', 'http://img1.mm131.com/pic/2274/51.jpg')]
[('美护士沈梦瑶制服写真大胆张腿很诱惑(图1)', 'http://img1.mm131.com/pic/2746/1.jpg')]
[('美护士沈梦瑶制服写真大胆张腿很诱惑(图2)', 'http://img1.mm131.com/pic/2746/2.jpg')]
[('美护士沈梦瑶制服写真大胆张腿很诱惑(图3)', 'http://img1.mm131.com/pic/2746/3.jpg')]
[('美护士沈梦瑶制服写真大胆张腿很诱惑(图4)', 'http://img1.mm131.com/pic/2746/4.jpg')]
.
.
.

你可能感兴趣的:(初学python3爬虫 (一))