import re
from urllib import urlretrieve
from selenium import webdriver
import time
# Location of the PhantomJS executable used to drive the headless browser.
PHANTOMJS_PATH = "C:/Users/lance/Downloads/phantomjs-2.1.1-windows/bin/phantomjs"

# Module-level browser session; getHtml() loads pages through it and the
# script entry point shuts it down when the crawl is finished.
driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH)
def getHtml(url, wait=15):
    """Load *url* in the shared browser and return the rendered page source.

    A screenshot of the page is also written to the current working
    directory. Its file name is derived from the URL with every character
    outside ``[A-Za-z0-9._-]`` replaced by ``_``, because a raw URL contains
    ``:``, ``/`` and ``#`` and therefore cannot be used as a file name.

    Args:
        url: Address of the page to load.
        wait: Seconds to pause after navigation before reading the page
            (defaults to the original fixed delay of 15 seconds).

    Returns:
        The page source encoded as UTF-8 bytes; characters that cannot be
        encoded are dropped.
    """
    driver.get(url)
    # Fixed pause before reading the DOM -- presumably to let the page's
    # scripts finish loading the images; confirm before shortening.
    time.sleep(wait)
    html = driver.page_source.encode('utf-8', 'ignore')

    shot_name = re.sub(r'[^A-Za-z0-9._-]+', '_', url) + ".png"
    # get_screenshot_as_file reports failure through its return value rather
    # than by raising, so only announce success when it actually succeeded.
    if driver.get_screenshot_as_file(shot_name):
        print("Success To Create the screenshot & gather html")
    else:
        print("Gathered html but failed to save screenshot %s" % shot_name)
    return html
# src attribute of a .jpg hosted on imgsrc.baidu.com.  The dots in the host
# name are escaped so they only match a literal '.', and the URL body may not
# contain a double quote, so a match can never run past the end of the
# attribute into a later one.
_IMG_SRC_RE = re.compile(r'src="(http://imgsrc\.baidu\.com/[^"]*?\.jpg)"')


def getImgList(html):
    """Return the Baidu-hosted ``.jpg`` image URLs found in *html*.

    Args:
        html: Page source, either as UTF-8 encoded bytes (what ``getHtml``
            returns) or as already-decoded text.

    Returns:
        A list of URL strings in document order; empty when nothing matches.

    Raises:
        UnicodeDecodeError: If *html* is bytes that are not valid UTF-8.
    """
    # Only decode when handed raw bytes; text is searched as-is.
    if isinstance(html, bytes):
        html = html.decode('utf-8')
    return _IMG_SRC_RE.findall(html)
def imgDownload(imglist, i, dest_dir='E:/spider/beautiful/'):
    """Download every URL in *imglist* into *dest_dir*.

    Each image is saved as ``<index><i>.jpg``, where ``index`` is the
    zero-based position of the URL in *imglist*.

    Args:
        imglist: Iterable of image URLs to fetch.
        i: Suffix appended to every file name (the caller passes the page
            number as a string).
        dest_dir: Directory that receives the files. Defaults to the
            location the script has always used; it is created if missing.
    """
    import os  # local import: the file's import block does not provide os

    # Without the directory the very first download would fail.
    if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)

    for index, imgurl in enumerate(imglist):
        print(imgurl)
        target = os.path.join(dest_dir, '%s%s.jpg' % (index, i))
        urlretrieve(imgurl, target)
# Base address of the thread's picture view; the page number is appended.
url = 'http://tieba.baidu.com/p/2173159072#!/l/p'


def main():
    """Crawl pages 1-6 of the thread and download the images on each one.

    The browser session is always shut down afterwards, even when a page
    fails to load or a download raises.
    """
    try:
        for page in range(1, 7):
            page_url = url + str(page)
            print(page_url)
            html = getHtml(page_url)
            img_list = getImgList(html)
            print(img_list)
            imgDownload(img_list, str(page))
    finally:
        # quit() ends the whole WebDriver session and its browser process;
        # close() would only close the current window.
        driver.quit()


if __name__ == '__main__':
    main()