python_Spider 学习笔记

 
  
#-*- coding:utf-8 -*-
import urllib
from urllib import parse
from urllib import request


def writePage(html, filename):
    """Write fetched page content to a file.

    Args:
        html: Raw page content as ``bytes`` (what ``loadPage`` returns).
        filename: Path of the file to create or overwrite.
    """
    print("keep file....")
    print(type(html))
    # html is bytes, so the file has to be opened in binary mode ("wb+").
    with open(filename, "wb+") as f:
        f.write(html)
    print("-" * 30)


def loadPage(url, filename):
    """Fetch the content found at ``url``.

    Args:
        url: Address to request.
        filename: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        The response body as ``bytes``.
    """
    print("loding.....")
    print(url)
    un_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
    }
    # Build a request object that carries a browser User-Agent header.
    a_request = urllib.request.Request(url, headers=un_headers)
    # Read the body inside a context manager so the response (and its
    # underlying connection) is closed even if read() raises.
    with urllib.request.urlopen(a_request) as response:
        return response.read()


def tiebaSpider(url, beginPage, endPage):
    """Download a range of pages and save each one to its own file.

    For every page number from ``beginPage`` to ``endPage`` (inclusive) the
    query parameter ``&pn=<(page - 1) * 50>`` is appended to ``url``, the
    result is fetched with ``loadPage`` and written to ``page<N>.html`` in
    the current working directory with ``writePage``.

    Args:
        url: Base URL that already contains the query string prefix.
        beginPage: First page number to fetch.
        endPage: Last page number to fetch (inclusive).
    """
    for page in range(beginPage, endPage + 1):
        # The pn parameter advances in steps of 50 per page.
        pn = (page - 1) * 50
        filename = "page" + str(page) + ".html"
        fullurl = url + "&pn=" + str(pn)
        print(" fullurl [%s]" % fullurl)
        html = loadPage(fullurl, filename)
        writePage(html, filename)
    print("thanks......")


if __name__ == '__main__':
    kw = input("请输入需要爬去的贴吧名:")
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页"))
    # The URL must start with "http://"; without a scheme urllib raises
    # ValueError("unknown url type: ...").
    url = "http://tieba.baidu.com/f?"
    # urlencode() takes a dict and percent-encodes the search keyword.
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    print(fullurl)
    tiebaSpider(fullurl, beginPage, endPage)

你可能感兴趣的:(script)