from urllib import request
import urllib.parse
import time
# Build the request headers
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.4.2.17629"
}
# URL pattern:
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0    # page 1: (1-1)*50
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50   # page 2: (2-1)*50
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100  # page 3: (3-1)*50
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=150  # page 4: (4-1)*50
# for i in range(1, 5):
#     print("http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=" + str((i-1)*50))
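# A minimal sketch (not part of the original script) of how one full page URL could be
# assembled with urllib.parse; the keyword "python" and page 2 are assumed values here:
#     query = urllib.parse.urlencode({"kw": "python", "ie": "utf-8"})
#     fullurl = "http://tieba.baidu.com/f?" + query + "&pn=" + str((2 - 1) * 50)
#     # -> http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50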
# Download one page and return the raw response bytes
def loadpage(fullurl, filename):
    print("Downloading:", filename)
    req = request.Request(fullurl, headers=header)
    resp = request.urlopen(req).read()
    return resp
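# A possible hardening of loadpage (a sketch, not in the original): pass a timeout and
# catch urllib.error.URLError (requires `import urllib.error`) so one failed request
# does not abort the whole run:
#     try:
#         resp = request.urlopen(req, timeout=10).read()
#     except urllib.error.URLError as e:
#         print("Request failed:", e)
#         resp = b""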
# Save the downloaded HTML to a local file
def writepage(html, filename):
    print("Saving:", filename)
    with open(filename, "wb") as f:
        f.write(html)
    print("-----------------------")
# Build the URL for each page and drive the download/save steps
def tiebaspider(url, begin, end):
    for page in range(begin, end + 1):
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)              # full URL for this request
        filename = "c:/page_" + str(page) + ".html"   # file name used to save this page
        html = loadpage(fullurl, filename)            # fetch the page
        writepage(html, filename)                     # write the page content to disk
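# Optionally (not in the original), a short pause between requests keeps the load on the
# server low; time is already imported, so e.g. `time.sleep(1)` could be added at the end
# of each loop iteration in tiebaspider.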
if __name__ == '__main__':
    kw = input("Enter the Tieba forum name: ")
    begin = int(input("Enter the start page: "))
    end = int(input("Enter the end page: "))
    url = "http://tieba.baidu.com/f?"
    key = urllib.parse.urlencode({"kw": kw})  # URL-encode the keyword, e.g. kw=python
    url = url + key
    tiebaspider(url, begin, end)
    time.sleep(10)  # pause briefly before the program exits
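# Example run (assumed input values; the saved file names follow the pattern used in
# tiebaspider above):
#     Enter the Tieba forum name: python
#     Enter the start page: 1
#     Enter the end page: 3
#     -> saves c:/page_1.html, c:/page_2.html, c:/page_3.html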