# Web crawler example: Java job listings on 51job (前程无忧网)

import requests
from lxml import etree

# 取得html
def getHtml(html, timeout=10):
    """Download a page with an HTTP GET and return its text decoded as GBK.

    Args:
        html: URL of the page to fetch (the name is kept for backward
            compatibility with existing callers).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request may block indefinitely and the read-timeout
            handler below could never run.

    Returns:
        The response body as a string, or None when the request failed
        (a short message is printed in that case).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    try:
        response = requests.get(html, headers=headers, timeout=timeout)
    # The exception classes live in requests.exceptions; referring to them by
    # bare name would raise NameError (or match the unrelated builtin
    # ConnectionError) instead of handling the failure.
    except requests.exceptions.ReadTimeout:
        print("time out")
    except requests.exceptions.ConnectionError:
        print("connection error")
    except requests.exceptions.RequestException:
        print("request error")
    else:
        # Force GBK decoding of the body instead of the auto-detected encoding.
        response.encoding = 'GBK'
        return response.text
    return None
        
def changeString(str):
    """Return the given text with all spaces, newlines and carriage returns removed."""
    unwanted = (" ", "\n", "\r")
    return "".join(ch for ch in str if ch not in unwanted)
    
        
def _joinedText(node, path):
    """Join every text node matched by a relative XPath and strip whitespace."""
    return changeString("".join(node.xpath(path)))


def getInformation(html):
    """Extract the job rows from a search-result page.

    Args:
        html: Page source as a string. None or an empty string (for example
            after a failed download) yields an empty result.

    Returns:
        A list of five-element lists, one per row: the text of the row's
        ``p/span/a`` link, the text of the link in its first ``span``, and the
        text of its second, third and fourth ``span``, all with spaces and
        line breaks removed. A missing field becomes an empty string.
    """
    if not html:
        return []
    tree = etree.HTML(html, etree.HTMLParser())
    if tree is None:
        return []

    results = []
    # Fields are read row by row, so an empty cell only blanks its own row
    # and cannot shift the columns of the rows that follow it.
    for row in tree.xpath('//*[@id="resultList"]/div[*]'):
        # Rows without a p/span/a link (such as a header row) are skipped.
        if not row.xpath('p/span/a/text()'):
            continue
        results.append([
            _joinedText(row, 'p/span/a/text()'),
            _joinedText(row, 'span[1]/a/text()'),
            _joinedText(row, 'span[2]/text()'),
            _joinedText(row, 'span[3]/text()'),
            _joinedText(row, 'span[4]/text()'),
        ])
    return results
    

def printInformation(data):
    """Print a header line and then one centred, tab-separated line per row.

    Args:
        data: Iterable of rows; the first five items of each row are printed.
    """
    row_format = "{0:^15}\t{1:^20}\t{2:^15}\t{3:^15}\t{4:^15}\t"
    # Header: job title, company, location, salary, publication date.
    header = ("职位名", "公司名", "工作地点", "薪资", "发布时间")
    print(row_format.format(*header))
    for row in data:
        print(row_format.format(row[0], row[1], row[2], row[3], row[4]))

 # 保存数据
def store(a, path="./hahaha.txt"):
    """Write the rows to a UTF-8 text file as centred, tab-separated columns.

    Any existing file at ``path`` is overwritten. The first line is a header;
    every following line holds the first five items of one row.

    Args:
        a: Iterable of rows, each with at least five string items.
        path: Destination file. Defaults to the location that was previously
            hard-coded, so existing callers are unaffected.
    """
    tplt = "{0:^15}\t{1:^15}\t{2:^15}\t{3:^15}\t{4:^15}\t"
    with open(path, "w", encoding="utf-8") as file:
        # Header: job title, company, location, salary, publication date.
        file.write(tplt.format("职位名", "公司名", "工作地点", "薪资", "发布时间") + "\n")
        for row in a:
            file.write(tplt.format(row[0], row[1], row[2], row[3], row[4]) + "\n")

def main():
    """Fetch the first Java search-result page, then print and save its rows."""
    # "&degreefrom" had been corrupted to "°reefrom" because "&deg" was decoded
    # as an HTML entity; the query parameter is restored here.
    url="https://search.51job.com/list/120000,000000,0000,32,9,99,Java%25E5%25BC%2580%25E5%258F%2591,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    html = getHtml(url)
    if html is None:
        # Download failed: getHtml returns None after printing its own
        # message, so there is nothing to parse.
        return
    results = getInformation(html)
    printInformation(results)
    store(results)
    print("OK")


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

# You may also be interested in: web crawlers (网络爬虫)