#encoding=utf-8
import urllib2,urllib
class tieba:
    """Small Baidu Tieba crawler that saves forum listing pages as HTML files.

    ``tiebaSpider`` drives the crawl; ``loadPage`` downloads one page and
    ``writePage`` stores it in a file named ``di<page>ye.html``.
    """

    # Listing endpoint; the urlencoded query parameters are appended to it.
    BASE_URL = "https://tieba.baidu.com/f?"

    # The "pn" query value advances by this amount from one page to the next
    # (presumably the number of threads shown per listing page).
    PAGE_STEP = 50

    # Browser-like User-Agent sent with every request.
    HEADERS = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6"}

    def tiebaSpider(self, name, starPage, endPage):
        """Download and save every listing page from starPage to endPage.

        Args:
            name: Forum keyword, sent as the ``kw`` query parameter.
            starPage: First page to fetch; page ``i`` maps to offset
                ``(i - 1) * PAGE_STEP``, so pages are counted from 1.
            endPage: Last page to fetch (inclusive).
        """
        # Address carrying the forum keyword; only the "pn" offset differs
        # between pages, so this prefix is built once.
        url = self.BASE_URL + urllib.urlencode({"kw": name}) + "&"
        # Walk from the start page to the end page, both inclusive.
        for i in range(starPage, endPage + 1):
            pageNum = (i - 1) * self.PAGE_STEP
            fullUrl = url + urllib.urlencode({"pn": pageNum})
            html = self.loadPage(fullUrl, i)
            self.writePage(html, i)

    def loadPage(self, url, pageNum):
        """Fetch ``url`` and return the raw response body.

        Args:
            url: Full address of the page to download.
            pageNum: Page number, used only in the progress message.

        Returns:
            Whatever ``response.read()`` yields for the request.
        """
        print("正在爬取第" + str(pageNum) + "页数据......")
        request = urllib2.Request(url, headers=self.HEADERS)
        response = urllib2.urlopen(request)
        # Close the response even when read() raises, so the connection is
        # not leaked across the many requests of one crawl.
        try:
            return response.read()
        finally:
            response.close()

    def writePage(self, html, pageNum):
        """Write ``html`` to ``di<pageNum>ye.html`` (relative path).

        Args:
            html: Page content as returned by ``loadPage``.
            pageNum: Page number used to build the file name.
        """
        filename = "di" + str(pageNum) + "ye.html"
        print("正在写入" + filename)
        # Binary mode stores the downloaded bytes unchanged (text mode would
        # rewrite newlines on Windows); one write() replaces writelines(),
        # which iterated the string one character at a time.
        with open(filename, "wb") as out:
            out.write(html)
# Script entry point: ask for a forum name and a page range, then crawl it.
if __name__ == '__main__':
    name = raw_input("请输入要搜索的贴吧名:")
    # Python 2's input() evaluates the typed text as an expression, so read
    # the text with raw_input() and convert it explicitly; non-numeric input
    # now raises ValueError instead of being executed.
    startPage = int(raw_input("请输入起始页:"))
    endPage = int(raw_input("请输入一个终止页:"))
    # Use a separate name so the class `tieba` is not rebound to an instance.
    spider = tieba()
    spider.tiebaSpider(name, startPage, endPage)
    print("爬取结束!")