爬取7160美女图片

原文链接: http://www.cnblogs.com/php-linux/p/8321709.html
#coding=utf-8

import urllib.request
from bs4 import BeautifulSoup
from urllib import error
import re
# Presumably the site's category path segments (the last one, 'xiaohua', matches
# the path used by the crawl loop below).
# NOTE(review): this list is not referenced anywhere in the visible code — the
# crawl loop hard-codes "xiaohua". Confirm whether iterating it was intended.
ls = ['zhenrenxiu','meinv',"lianglichemo",'rentiyishu','xiaohua']
def validateTitle(title):
    """Return *title* made safe for use as a Windows file name.

    Every reserved character (``/ \\ : * ? " < > |``) is replaced with an
    underscore. ASCII control characters (0x00-0x1F, e.g. newlines and tabs
    that often surround text extracted from HTML) are replaced as well, since
    they are equally invalid in Windows file names.

    Args:
        title: The text to sanitize; must be a ``str``.

    Returns:
        The sanitized string, with the same length as the input.

    Raises:
        TypeError: If *title* is not a string (e.g. ``None``).
    """
    # One character class covering reserved punctuation plus control chars.
    rstr = r'[/\\:*?"<>|\x00-\x1f]'
    return re.sub(rstr, "_", title)

# Crawl albums under the site's /xiaohua/ path. For each numeric album id the
# landing page is fetched, a page count is read from it, and every page of the
# album is then visited to save the image it references.
BASE_URL = "http://www.7160.com/xiaohua/"
SAVE_DIR = "D://img2/"  # must already exist; urlretrieve does not create it
# First text node containing this character is taken as the pager text
# (presumably something like "共N页" — confirm against the live markup).
TOTAL_MARKER = re.compile('共')
DIGITS = re.compile(r'\d+')  # hoisted: identical for every album

for j in range(1, 60000):
    url_origin = BASE_URL + str(j)
    try:
        # Close the HTTP response as soon as the document has been parsed.
        with urllib.request.urlopen(url_origin) as page_obj:
            page_soup = BeautifulSoup(page_obj, 'lxml')

        total_node = page_soup.find(text=TOTAL_MARKER)
        if total_node is None:
            # No pager text on the landing page; report it like any other
            # failed album instead of relying on an AttributeError.
            print("异常" + str(j))
            continue

        match = DIGITS.search(total_node.string)
        total_page = int(match.group()) if match is not None else 0

        # Pages are numbered 1..total_page inclusive; the upper bound needs
        # +1 so the last page (or the only page) is not skipped.
        for i in range(1, total_page + 1):
            # The first page has no numeric suffix; later pages are index_<n>.
            if i == 1:
                url = url_origin + "/index.html"
            else:
                url = url_origin + "/index_" + str(i) + ".html"
            request = urllib.request.Request(url)
            try:
                with urllib.request.urlopen(request) as res:
                    soup = BeautifulSoup(res, 'lxml')

                title_obj = soup.find(attrs={"class": "picmainer"})
                if title_obj is None:
                    # Not a picture page; nothing to download.
                    continue

                print(url)
                title = title_obj.h1.string
                # NOTE(review): this takes the first <img> in the whole
                # document, not necessarily the one inside "picmainer" —
                # confirm against the page layout.
                content = soup.find('img')
                src = content.get("src")

                # NOTE(review): if every page of an album shares one <h1>
                # title, each download overwrites the previous file — verify.
                file_name = validateTitle(title) + ".jpg"
                urllib.request.urlretrieve(src, SAVE_DIR + file_name)
                print(file_name + "保存成功")
            except Exception as e:
                # Best-effort crawl: report the failing album and its cause,
                # then move on to the next page.
                print("异常" + str(j), repr(e))
    except Exception as e:
        # Landing page failed (network error, HTTP error, parse error, ...):
        # report and continue with the next album id.
        print("异常" + str(j), repr(e))

转载于:https://www.cnblogs.com/php-linux/p/8321709.html

你可能感兴趣的:(爬取7160美女图片)