# python 爬取表情包——斗图啦 (Python: scraping meme images from doutula.com)

#import urllib
import requests
import time
from lxml import etree
# Landing page of the site; fetched once below.
url = 'http://www.doutula.com/'

# Headers attached to every request this script makes.
# NOTE(review): presumably browser-like values so the site serves the
# script normally -- not verified here.
headers = {
    'Referer': 'http://www.doutula.com/',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'
    ),
}

# Fetch the landing page and dump its raw HTML to stdout.
resp = requests.get(url, headers=headers)
print(resp.text)
'''

总爱在我的生活里指手画脚,俗称经验婊和过来人婊
 '''

#开始解析

#html=etree.HTML(resp.text)
#srcs=html.xpath('.//img/@data-original')
#for src in srcs:
#    filename=src.split('/')[-1]
#    img=requests.get(src,headers=headers)
#    
#    with open('D:\Anaconda3\imgs/'+filename,'wb') as file:
#        file.write(img.content)  
#    print(src,filename)
#    
#print(len(src))



def download_img(src, save_dir=r'D:\Anaconda3\imgs/'):
    """Download one image and save it under its original file name.

    Args:
        src: URL of the image. The text after the last ``/`` is used as the
            local file name.
        save_dir: Prefix prepended verbatim to the file name, so it must end
            with a path separator. The default is a raw string because
            ``\\A`` and ``\\i`` are not valid escape sequences; the resulting
            path is the same one the script used before.

    Raises:
        OSError: If the target file cannot be opened for writing (for
            example when ``save_dir`` does not exist).
    """
    filename = src.split('/')[-1]
    img = requests.get(src, headers=headers)
    # Binary mode: the response body is written out unchanged.
    with open(save_dir + filename, 'wb') as file:
        file.write(img.content)
    print(src, filename)



def get_page(url):
    """Download every image referenced by a ``data-original`` attribute.

    Fetches ``url``, collects the ``data-original`` attribute of each
    ``<img>`` element and hands every value to ``download_img``.

    Args:
        url: Address of the page to scrape.

    Returns:
        The ``href`` values of all ``<a rel="next">`` elements on the page,
        as a list. The list is empty when the page has no such link, which
        lets the caller stop paging on a falsy result.
    """
    resp = requests.get(url, headers=headers)
    print(resp, url)
    html = etree.HTML(resp.text)
    for src in html.xpath('.//img/@data-original'):
        download_img(src)

    # Must be the real xpath result rather than a constant: the caller
    # relies on an empty list to detect that there is no further page.
    return html.xpath('.//a[@rel="next"]/@href')


next_link_base = 'http://www.doutula.com/article/list/?page='
# Stop after this page number even if the site offers more pages.
max_page = 4

# Scrape the landing page first (treated as page 1); its "next" links decide
# whether paging starts at all. No module-level ``html`` exists to query, so
# the page is fetched through get_page().
next_link = get_page(url)
current_num = 1
while next_link and current_num < max_page:
    time.sleep(0.2)  # short pause between page requests
    current_num += 1
    next_link = get_page(next_link_base + str(current_num))
        
        

'''
http://www.doutula.com/article/list/?page=581
'''

 

# 你可能感兴趣的:(python)