网络爬虫:pyquery

pyquery 的语法类似于 jQuery,可以对 HTML 文本进行解析。安装命令:pip3 install pyquery
from pyquery import PyQuery
import requests

pq = PyQuery(html文档)
pq('css选择器')
items():获取到多个标签,使用items()将PyQuery转化成一个生成器对象,然后使用for in 循环
filter('css选择器'):过滤
text():获取标签的文本
attr('属性名'):获取属性值
'''
def tencentJob(full):
    """Crawl the job list starting at ``full`` and follow each "next page" link.

    Pages are processed in a loop rather than by recursion, so a long
    listing cannot exhaust the interpreter's recursion limit.

    :param full: URL of the first job-list page to fetch.
    :return: None
    """
    page_url = full
    while page_url:
        # Job-list page source; load_data returns None for a non-200 response.
        html = load_data(page_url)
        if html is None:
            break
        next_url = parse_page_data(html)
        # The last page uses a 'javascript:;' placeholder as its next link;
        # a missing link (None) ends the crawl as well.
        if not next_url or next_url == 'javascript:;':
            break
        page_url = 'https://hr.tencent.com/' + next_url

def load_data(url, timeout=10):
    """Send a GET request and return the page source.

    :param url: address of the page to request.
    :param timeout: seconds passed to ``requests.get`` as its timeout, so a
        stalled server cannot block the crawl indefinitely.
    :return: the response text when the status code is 200, otherwise None.
        Exceptions raised by ``requests.get`` are not caught here.
    """
    req_header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'
    }
    response = requests.get(url, headers=req_header, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None


def parse_page_data(html):
    """Parse one job-list page and return the href of its "next page" link.

    For every listing row, the title, type, head count, location and publish
    time are read from the row's cells. The detail page is then fetched and
    parsed, and the assembled record is printed.

    :param html: source of a job-list page.
    :return: the ``href`` attribute of the ``a#next`` element, or None when
        the page has no such link.
    """
    html_pq = PyQuery(html)
    # Listing rows carry the class "even" or "odd"; a grouped selector
    # returns them in document order instead of all even rows first.
    rows = html_pq('tr.even, tr.odd')
    for tr in rows.items():
        title_link = tr('td.l.square a')
        cells = tr('td')
        jobinfo = {
            'title': title_link.text(),
            # The remaining fields are read by cell position within the row.
            'type': cells.eq(1).text(),
            'count': cells.eq(2).text(),
            'address': cells.eq(3).text(),
            'publishTime': cells.eq(4).text(),
        }
        # attr() returns None when the row has no title link; skip the
        # detail request instead of failing on str + None.
        href = title_link.attr('href')
        detail_html = None
        if href:
            detail_html = load_data('https://hr.tencent.com/' + href)
        # load_data returns None for a non-200 response; fall back to an
        # empty description in that case.
        if detail_html is not None:
            jobinfo['content'] = parse_detail_data(detail_html)
        else:
            jobinfo['content'] = ''
        # Emit the finished record; otherwise it is dropped at the end of
        # the iteration.
        print(jobinfo)
    # Href of the next-page link.
    return html_pq('a').filter('#next').attr('href')

def parse_detail_data(html):
    """
    Extract the description text from a job detail page.

    :param html: source of a job detail page.
    :return: the text of every ``ul.squareli li`` element, concatenated
        with no separator.
    """
    # Select the <li> elements that hold the detail content.
    detail_items = PyQuery(html)('ul.squareli li').items()
    return ''.join(item.text() for item in detail_items)


if __name__ == '__main__':
    # Starting offset, sent as the ``start`` query parameter
    offset = 0
    # Build the full URL of the first job-list page
    full_url = 'https://hr.tencent.com/position.php?&start='+ str(offset)
    tencentJob(full_url)

\

你可能感兴趣的:(网络爬虫:pyquery)