#import urllib
import requests
import time
from lxml import etree
# Target site and request headers. The Referer and a browser-like User-Agent
# are sent with every request (the site may serve different content to
# clients without them).
url = 'http://www.doutula.com/'
headers = {
    'Referer': 'http://www.doutula.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
}

# Smoke-test fetch of the home page; prints the raw HTML so the markup
# (notably the img/@data-original attributes parsed below) can be inspected.
resp = requests.get(url, headers=headers)
print(resp.text)
def download_img(src, save_dir='D:\\Anaconda3\\imgs'):
    """Download a single image and save it under *save_dir*.

    Args:
        src: Absolute URL of the image (its last path segment becomes
            the local file name).
        save_dir: Directory to write into. The default matches the
            original hard-coded location, so existing callers are
            unaffected; the parameter just makes the helper reusable.
    """
    filename = src.split('/')[-1]
    img = requests.get(src, headers=headers)
    # NOTE: the original literal 'D:\Anaconda3\imgs/' relied on the invalid
    # escape sequences \A and \i passing through unchanged (deprecated in
    # modern Python); the backslashes are now escaped explicitly.
    with open(save_dir + '/' + filename, 'wb') as file:
        file.write(img.content)
    print(src, filename)
def get_page(url):
    """Fetch one listing page, download every image on it, and return the
    href value(s) of the rel="next" pagination link.

    Returns:
        The list produced by the xpath query — empty on the last page,
        which lets the caller's ``while next_link`` loop terminate.

    Bug fixed: the original ended with ``return ['next_link']`` — a literal
    one-element list of the string 'next_link', not the xpath result — so
    the return value was always truthy and the caller could never detect
    the last page.
    """
    resp = requests.get(url, headers=headers)
    print(resp, url)
    html = etree.HTML(resp.text)
    # data-original holds the real image URL (lazy-loading attribute).
    srcs = html.xpath('.//img/@data-original')
    for src in srcs:
        download_img(src)
    next_link = html.xpath('.//a[@rel="next"]/@href')
    return next_link
# Paginated crawl driver.
# Bug fixed: the original did `next_link = html.xpath(...)` at module level,
# but `html` only ever exists inside get_page(), so the script crashed with
# a NameError before the loop could start. Seed the loop explicitly instead.
next_link_base = 'http://www.doutula.com/article/list/?page='
current_num = 1
next_link = True  # truthy seed so the first iteration runs
while next_link:
    time.sleep(0.2)  # small delay to be polite to the server
    current_num += 1
    # get_page returns the rel="next" hrefs; an empty list ends the loop.
    next_link = get_page(next_link_base + str(current_num))
    if current_num >= 4:  # safety cap: crawl pages 2-4 only
        break
'''
Note: the listing currently runs up to page 581:
http://www.doutula.com/article/list/?page=581
'''