# Scrape 51job search results with XPath and save them to an Excel file.

#coding=gb18030
import requests
from bs4 import BeautifulSoup
from lxml import etree
import os
import time
import xlwt
import urllib2
import string

# Accumulators: search-result page URLs; `th` is unused downstream but kept
# for backward compatibility.
ur = []
th = []
zp = u'招聘_'  # "recruitment_" marker used to trim page titles later on
xl = xlwt.Workbook()
# cell_overwrite_ok allows rewriting a cell if the same row is hit twice.
st = xl.add_sheet('job', cell_overwrite_ok=True)

# Build the URLs of the first two result pages of a 51job keyword search
# for "python" in area 040000.
# BUG FIX: the original URL contained '99°reefrom=99' — the '&d' of the
# '&degreefrom=99' parameter was mangled into a degree sign when the code
# was copied from a web page; restored here.
for i in range(1, 3):
    url = ('http://search.51job.com/list/040000,000000,0000,00,9,99,python,2,'
           + str(i) +
           '.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99'
           '&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1'
           '&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line='
           '&specialarea=00&from=&welfare=')
    ur.append(url)

# Pretend to be an old IE browser so the site serves the plain HTML page.
header = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)'}

# Fetch every search-result page and collect the individual job-posting URLs.
# BUG FIX: the original XPath spelled 'onmousedown' with Greek omicron
# characters (U+03BF, a copy/paste artifact from a web page), so it could
# never match a real HTML attribute; restored to plain ASCII.
job_link = []
for u in ur:
    req = urllib2.Request(u, headers=header)
    resp = urllib2.urlopen(req)
    response = resp.read()
    resp.close()
    html = etree.HTML(response)
    # Presumably the result-page job links carry an empty onmousedown
    # attribute — confirm against the live page markup.
    result = html.xpath('//a[@onmousedown=""]/@href')
    for x in result:
        job_link.append(x)

# Visit every collected job link and write one spreadsheet row per posting.
# Column layout (inferred from the XPaths — confirm against the live page):
# 0=title remainder, 1=<h1> job name, 2=sp4 tags, 3=lname location,
# 4=bmsg address block, 5=second <strong>, 6=job description text.
k = 0
for job in job_link:
    k += 1  # row index; advances even on failure, leaving a blank row
    try:
        req = urllib2.Request(job)
        res = urllib2.urlopen(req)
        respon = res.read()
        res.close()
        sou = etree.HTML(respon)
        st_title = sou.xpath('//title/text()')
        # Strip everything up to and including the zp (u'招聘_') marker,
        # plus the fixed 12-character site suffix, from the page title.
        # len(zp) replaces the magic '+3' of the original.
        ss = st_title[0].find(zp) + len(zp)
        st.write(k, 0, st_title[0][ss:-12].strip())
        st.write(k, 1, sou.xpath('//h1/text()')[0])
        t1 = sou.xpath('//span[@class="sp4"]//text()')
        st.write(k, 2, ','.join(t1))
        st.write(k, 3, sou.xpath('//span[@class="lname"]/text()')[0])
        dz = sou.xpath('//div[@class="bmsg inbox"]//text()')
        st.write(k, 4, ''.join(dz).strip())
        st.write(k, 5, sou.xpath('//strong/text()')[1])
        di = sou.xpath('//div[@class="bmsg job_msg inbox"]//text()')
        st.write(k, 6, ''.join(di).replace('\n', '').strip())
    except Exception as e:
        # Best-effort scraping: report and skip pages whose layout differs,
        # instead of the original bare 'except: pass' that also silently
        # swallowed KeyboardInterrupt/SystemExit.
        print('skipping %s: %s' % (job, e))
xl.save('d:\\job_python.xls')
