#selenium 爬取javascript编写的网页 #使用xpath #爬取职位名称以及基本介绍
# 直接上代码！
from selenium import webdriver
from lxml import etree
# Scraper for a single page of Tencent careers search results.
class tencent(object):
    """Scrape job titles and short descriptions from one search-result page.

    The page is rendered by a PhantomJS browser driven through Selenium,
    parsed with lxml, and every extracted record is appended to the file
    ``tencent.doc`` and echoed to the console.
    """

    def __init__(self, url):
        """Remember the URL of the page to scrape.

        Args:
            url: Address of the search-result page that ``driver`` loads.
        """
        self.url = url

    def save(self, content):
        """Append ``content`` followed by a newline to ``tencent.doc``.

        Args:
            content: Text to append. The file is opened in append mode with
                UTF-8 encoding, so earlier records are kept.
        """
        with open('tencent.doc', 'a+', encoding='utf-8') as file:
            file.write(content + '\n')

    def driver(self):
        """Load ``self.url``, extract every job entry, then save and print it.

        Result cards that lack either a title or a description are skipped
        instead of aborting the whole run with an ``IndexError``.
        """
        # NOTE(review): PhantomJS support has been dropped from recent
        # Selenium releases - confirm the pinned Selenium version or migrate
        # to a headless browser driver.
        browser = webdriver.PhantomJS()
        try:
            # Use the URL given to the constructor, not a module-level global.
            browser.get(self.url)
            page_source = browser.page_source
        finally:
            # Always shut the browser down; otherwise one PhantomJS process
            # is left behind for every page scraped.
            browser.quit()

        tree = etree.HTML(page_source)
        for item in tree.xpath('//div[@class="recruit-list"]'):
            titles = item.xpath('.//a/h4/text()')  # job title
            infos = item.xpath('.//a/p[2]/text()')  # short description
            if not titles or not infos:
                # Incomplete card: nothing meaningful to record.
                continue
            title, info = titles[0], infos[0]
            self.save('\n'.join([title, info]))
            print(title)
            print(info)
if __name__ == '__main__':
    # Walk result pages 1 through 416; the page number is passed in the
    # `index` query parameter of the search URL.
    for page in range(1, 417):
        url = 'https://careers.tencent.com/search.html?&index=%s' % page
        scraper = tencent(url)  # one scraper object per result page
        scraper.driver()  # fetch, parse, save and print this page
        print('***' * 50)  # separator line after each page's output
# 爬取结果截图：