一: 入门写法
import urllib.request, urllib.parse
import random
# Pool of User-Agent headers; one is picked at random so repeated requests
# do not all present the same client fingerprint.
header_list = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16'},
]
headers = random.choice(header_list)

# Interactive parameters: forum name and an inclusive page range.
name = input('请输入贴吧名:')
start = int(input('请输入起始页:'))
end = int(input('请输入结束页:'))

# urlencode turns {'kw': name} into 'kw=<percent-encoded name>' — it inserts
# the '=' itself, so only '&pn=...' needs appending below.
kw = urllib.parse.urlencode({'kw': name})

for i in range(start, end + 1):
    print(i)
    # Tieba paginates 50 posts per page: page i starts at pn = (i - 1) * 50.
    pn = (i - 1) * 50
    baseurl = 'https://tieba.baidu.com/f?'
    url = baseurl + kw + '&pn=' + str(pn)
    req = urllib.request.Request(url, headers=headers)
    # Use the response as a context manager so the HTTP connection is closed
    # even if read()/decode() raises (the original never closed it).
    with urllib.request.urlopen(req) as res:
        html = res.read().decode('utf-8')
    filename = '第' + str(i) + '页.html'
    with open(filename, 'w', encoding='utf-8') as f:
        print('正在爬取第%d页' % i)
        f.write(html)
二: 用函数的方法
import random
import urllib.request, urllib.parse
def readPage(url):
    """Fetch *url* with a randomly chosen User-Agent and return the
    response body decoded as UTF-8 text.

    :param url: fully-built request URL (query string already encoded)
    :return: page HTML as a str
    """
    # Rotate between a few browser User-Agents so requests vary.
    header_list = [
        {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'},
        {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'},
        {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16'},
    ]
    headers = random.choice(header_list)
    req = urllib.request.Request(url, headers=headers)
    # Context manager guarantees the HTTP response is closed even when
    # read()/decode() raises (the original leaked the connection).
    with urllib.request.urlopen(req) as res:
        return res.read().decode('utf-8')
def writePage(filename, html):
    """Save *html* to *filename* as UTF-8 text and report success."""
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(html)
    print('写入成功')
def main():
    """Prompt for a forum name and an inclusive page range, then download
    each result page to a local HTML file via readPage/writePage."""
    forum = input('请输入贴吧名:')
    first = int(input('请输入起始页:'))
    last = int(input('请输入结束页:'))
    # urlencode produces 'kw=<percent-encoded forum name>'.
    query = urllib.parse.urlencode({'kw': forum})
    base = 'https://tieba.baidu.com/f?'
    for page in range(first, last + 1):
        # 50 posts per page -> offset pn = (page - 1) * 50.
        offset = (page - 1) * 50
        page_url = base + query + '&pn=' + str(offset)
        writePage('第' + str(page) + '页.html', readPage(page_url))
# Run the crawler only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()
三: 类的方法
import urllib.request
import urllib.parse
import random
class BaiduSpider:
    """Crawler that downloads Baidu Tieba result pages to local HTML files."""

    def __init__(self):
        # Pool of User-Agent headers; one is chosen per spider instance.
        self.header_list = [
            {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36'},
            {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'},
            {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16'},
        ]
        self.headers = random.choice(self.header_list)
        self.baseurl = 'https://tieba.baidu.com/f?'

    def readPage(self, url):
        """Fetch *url* and return the response body decoded as UTF-8 text."""
        req = urllib.request.Request(url, headers=self.headers)
        # Context manager closes the HTTP response even when read()/decode()
        # raises (the original never closed the connection).
        with urllib.request.urlopen(req) as res:
            return res.read().decode('utf-8')

    def writePage(self, filename, html):
        """Save *html* to *filename* as UTF-8 text and report success."""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)
            print('写入成功')

    def main(self):
        """Prompt for forum name and inclusive page range, then crawl."""
        name = input('请输入贴吧名:')
        start = int(input('请输入起始页:'))
        end = int(input('请输入结束页:'))
        # urlencode produces 'kw=<percent-encoded name>' (inserts the '=').
        kw = urllib.parse.urlencode({'kw': name})
        for i in range(start, end + 1):
            # 50 posts per page -> offset pn = (i - 1) * 50.
            pn = (i - 1) * 50
            url = self.baseurl + kw + '&pn=' + str(pn)
            html = self.readPage(url)
            self.writePage('第' + str(i) + '页.html', html)
# Instantiate and run the spider only when executed directly.
if __name__ == '__main__':
    BaiduSpider().main()
需要注意的几点问题
- 字典经 urllib.parse.urlencode 编码后会自动在键值之间添加 = 等号(例如 {'kw': name} 变为 kw=编码后的值)
- 类中的成员都用 self. 的形式创建,调用的时候也同样通过 self. 访问
- 注意返回值return的使用和函数对象的传参