爬虫之get和post学习

1.百度搜索

# GET-request demo: search Baidu for a query term.
# Bug fix: in Python 3, `import urllib` does NOT make the `parse` and
# `request` submodules available — they must be imported explicitly.
import urllib.parse
import urllib.request

url = 'http://www.baidu.com/s'
# Percent-encode the non-ASCII query so it is safe to embed in the URL.
word = {'wd': '机器学习'}
word = urllib.parse.urlencode(word)
print(word)
newurl = url + '?' + word
# A browser-like User-Agent (plus session cookies) so Baidu serves the
# normal results page instead of blocking the default Python UA.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
           'Cookie': 'BAIDUID=F1B94BEF95818D4E217B38962DA35982:FG=1; BIDUPSID=F1B94BEF95818D4E217B38962DA35982; PSTM=1511953622; BD_UPN=12314753; delPer=0; BD_HOME=0; BD_CK_SAM=1; PSINO=5; H_PS_PSSID=1428_26911_21121_26350_22073; H_PS_645EC=9b8bwC3Z4y4mMZvDyp%2FLdeWxRxqoVDMlKNNosJwrnwME3OJKY4DLlkaGiis'
           }
request = urllib.request.Request(newurl, headers=headers)
response = urllib.request.urlopen(request)
print(response.read())

2.爬取百度贴吧(html页面)

但贴吧内容主要通过 JSON 接口动态加载,所以只爬取 HTML 页面的作用有限

# 爬取百度贴吧
import urllib
import urllib.parse
import urllib.request
def tiebaSpider(url, beginPage, endPage):
    """Download pages beginPage..endPage (inclusive) of a Tieba forum.

    Each page is fetched via loadPage and saved to a local HTML file
    via writeFile.
    """
    for page in range(beginPage, endPage + 1):
        # Tieba paginates in steps of 50 posts; page 1 maps to pn=0.
        offset = (page - 1) * 50
        filename = '第' + str(page) + "页.html"
        fullurl = url + '&ie=utf-8&pn=' + str(offset)
        content = loadPage(fullurl, filename)
        writeFile(content, filename)

def loadPage(url, filename):
    """Fetch *url* and return the raw response body as bytes.

    *filename* is used only for the progress message.
    """
    print('正在下载:' + filename)
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        'Cookie': 'BAIDUID=F1B94BEF95818D4E217B38962DA35982:FG=1; BIDUPSID=F1B94BEF95818D4E217B38962DA35982; PSTM=1511953622; BD_UPN=12314753; delPer=0; BD_HOME=0; BD_CK_SAM=1; PSINO=5; H_PS_PSSID=1428_26911_21121_26350_22073; H_PS_645EC=9b8bwC3Z4y4mMZvDyp%2FLdeWxRxqoVDMlKNNosJwrnwME3OJKY4DLlkaGiis'
        }
    # Bug fix: build the request from the `url` parameter, not the
    # module-level `newurl` — the original re-downloaded the same first
    # page for every iteration of tiebaSpider.
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    return response.read()

def writeFile(html, filename):
    """Write downloaded page content to *filename* as UTF-8 text."""
    # Bug fix: loadPage returns bytes; str(bytes) would write the
    # Python b'...' repr into the file instead of the actual HTML.
    # Decode bytes first (replacing any invalid sequences).
    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='replace')
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(str(html))

# http://tieba.baidu.com/f?kw=%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0&ie=utf-8&pn=0
# http://tieba.baidu.com/f?kw=%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0&ie=utf-8&pn=50
# http://tieba.baidu.com/f?kw=%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0&ie=utf-8&pn=100
if __name__=="__main__":
    # kw=input("需要爬取的贴吧:")
    kw='机器学习'
    # beginPage=int(input("起始页:"))
    beginPage=0
    # endPage=int(input("终止页:"))
    endPage=1
    url='http://tieba.baidu.com/f?'
    kw=urllib.parse.urlencode({'kw':kw})
    newurl=url+kw
    print(newurl)
    tiebaSpider(newurl,beginPage,endPage)

 

你可能感兴趣的:(爬虫之get和post学习)