# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
import os
from bs4 import BeautifulSoup
import re
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    A desktop-browser User-Agent is sent so the site does not reject
    the default urllib client.
    """
    req = urllib.request.Request(url)
    req.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    )
    # Use a context manager so the HTTP response is closed instead of
    # leaking the connection (original never closed it).
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_pagenum(url):
    """Return the current page number of the jandan index page as a string.

    The page embeds its number in the HTML as e.g. ``[2320]``; the digits
    inside the brackets are returned.

    Raises ValueError when no bracketed number is present (the original
    crashed with AttributeError on ``None.group()``).
    """
    html = url_open(url).decode("utf-8")
    # One pass with a capture group replaces the original two-step
    # search; \d+ accepts any width instead of exactly four digits.
    match = re.search(r'\[(\d+)\]', html)
    if match is None:
        raise ValueError("no page number found at %s" % url)
    return match.group(1)
def get_images(url):
    """Return the jpg image URLs found on one jandan comment page.

    The returned URLs are protocol-relative (``//host/path.jpg``) exactly
    as they appear in the ``src`` attribute; callers prepend ``http://``.
    """
    html = url_open(url).decode("utf-8")
    # NOTE(review): the original regex literal was garbled in transit
    # (``re.compile(r')`` is a syntax error).  This reconstruction
    # captures the protocol-relative src of each jpg <img> tag, which is
    # what the commented-out draft (``r'//w.+\.jpg'``) and save_imgs's
    # ``"http://%s" % each`` both expect.  Confirm against live HTML.
    jpg_re = re.compile(r'<img src="(//[^"]+?\.jpg)"')
    # With a single capture group, findall() yields just the captured
    # substrings, so no extra per-match processing is needed.
    return jpg_re.findall(html)
def save_imgs(img_list):
    """Download every image in *img_list* into the current directory.

    Each entry is a protocol-relative URL; the local file name is the
    last path component of the URL.
    """
    for index, image_url in enumerate(img_list, start=1):
        filename = image_url.split("/")[-1]
        with open(filename, "wb") as out_file:
            out_file.write(url_open("http://%s" % image_url))
        print("下载本页的第%s张图片,名称为%s" % (index, filename))
def download__mm(dir, url):
    """Download images from the 20 pages preceding the newest page.

    dir: target directory; created if missing, then made the CWD so
         save_imgs writes files into it.
    url: base URL of the board, e.g. "http://jandan.net/ooxx/".
    """
    # Both branches of the original chdir'd; only mkdir is conditional.
    if not os.path.isdir(dir):
        os.mkdir(dir)
    os.chdir(dir)
    # Removed the no-op ``url = url`` and the useless binding of
    # save_imgs()'s None return value.
    page_num = int(get_pagenum(url))
    for _ in range(20):
        page_num -= 1
        pageurl = url + "page-" + str(page_num) + "#comments"
        imgurl = get_images(pageurl)
        print("下载第%s页图片" % page_num)
        save_imgs(imgurl)
if __name__=="__main__":
dir="PaPa"
url= "http://jandan.net/ooxx/"
download__mm(dir,url)
# 但是存在的问题是:使用代理后抛出 urllib.error.URLError:
# [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。
# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
import os
from bs4 import BeautifulSoup
import re
import random
# Shared pool of "ip:port" proxy strings; filled by get_proxy() and
# consumed by change_proxy().
proxies = []
# Desktop-browser User-Agent so the scraped sites do not reject urllib.
headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}
def get_proxy():
    """Scrape xicidaili.com for HTTP proxies.

    Appends every proxy found to the module-level ``proxies`` list as
    "ip:port" strings and returns that list.
    """
    url = "http://www.xicidaili.com"
    req = urllib.request.Request(url, headers=headers)
    # Close the response instead of leaking the connection.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    # Raw string: the original non-raw literal relied on deprecated
    # invalid escapes (\d, \.).  Pattern itself is unchanged; groups are
    # the four IP octets followed by the port.
    ip_re = re.compile(r'(\d+)\.(\d+)\.(\d+)\.(\d+) \s*(\d+) ')
    for parts in ip_re.findall(html):
        ip = ".".join(parts[:4])
        proxies.append("%s:%s" % (ip, parts[4]))
    # print(proxies)  # debugging aid
    return proxies
def change_proxy():
    """Install a global urllib opener that routes HTTP via a random proxy.

    Picks a random entry from the module-level ``proxies`` list.  When
    the list is empty a direct (no-proxy) opener is installed instead —
    the original called random.choice unconditionally, which raises
    IndexError on an empty list and made its ``proxy == None`` branch
    unreachable.
    """
    proxy = random.choice(proxies) if proxies else None
    if proxy is None:  # identity check, not ``== None``
        proxy_support = urllib.request.ProxyHandler({})
    else:
        proxy_support = urllib.request.ProxyHandler({"http": proxy})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [("User-Agent", headers["User-Agent"])]
    # All later urllib.request.urlopen() calls now use this opener.
    urllib.request.install_opener(opener)
    print('智能切换代理:%s' % ('本机' if proxy is None else proxy))
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    Requests go through whatever opener change_proxy() installed, so
    downloads follow the currently selected proxy.
    """
    req = urllib.request.Request(url, headers=headers)
    # Context manager closes the response instead of leaking it.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_pagenum(url):
    """Return the current jandan index page number as a string of digits.

    The page embeds its number as e.g. ``[2320]``.  Raises ValueError
    when the marker is missing, instead of the original's opaque
    AttributeError from ``None.group()``.
    """
    html = url_open(url).decode("utf-8")
    # Single search with a capture group; \d+ is width-agnostic where
    # the original demanded exactly four digits.
    match = re.search(r'\[(\d+)\]', html)
    if match is None:
        raise ValueError("no page number found at %s" % url)
    return match.group(1)
def get_images(url):
    """Return the jpg image URLs found on one jandan comment page.

    URLs come back protocol-relative (``//host/path.jpg``), as they
    appear in the page; save_imgs prepends ``http://``.
    """
    html = url_open(url).decode("utf-8")
    # NOTE(review): the original pattern was garbled in transit
    # (``re.compile(r')``).  Reconstructed to capture the
    # protocol-relative src of each jpg <img> tag — verify against the
    # live page markup.
    jpg_re = re.compile(r'<img src="(//[^"]+?\.jpg)"')
    # With one capture group, findall() returns the captured substrings
    # directly — no per-match post-processing needed.
    return jpg_re.findall(html)
def save_imgs(img_list):
    """Download each image in *img_list* into the current directory.

    The file name is taken from the final path component of each
    protocol-relative URL.
    """
    for count, link in enumerate(img_list, 1):
        filename = link.split("/")[-1]
        data = url_open("http://%s" % link)
        with open(filename, "wb") as handle:
            handle.write(data)
        print("下载本页的第%s张图片,名称为%s" % (count, filename))
def download__mm(dir, url):
    """Download images from the 20 pages preceding the newest page.

    dir: target directory; created if missing, then made the CWD so
         save_imgs writes files into it.
    url: base URL of the board, e.g. "http://jandan.net/ooxx/".
    """
    # chdir happened on both branches of the original; only mkdir needs
    # to be conditional.
    if not os.path.isdir(dir):
        os.mkdir(dir)
    os.chdir(dir)
    # Dropped the no-op ``url = url`` and the pointless binding of
    # save_imgs()'s None return.
    page_num = int(get_pagenum(url))
    for _ in range(20):
        page_num -= 1
        pageurl = url + "page-" + str(page_num) + "#comments"
        imgurl = get_images(pageurl)
        print("下载第%s页图片" % page_num)
        save_imgs(imgurl)
if __name__=="__main__":
get_proxy()
change_proxy()
dir="PaPa"
url= "http://jandan.net/ooxx/"
download__mm(dir,url)