python+mongodb 爬广西人才网(gxrc.com)

gxrc

pip3 install requests
pip3 install beautifulsoup4
pip3 install pymongo

# start the MongoDB server before running the script
mongod
import requests
from bs4 import BeautifulSoup
import json
import time
import pymongo
from bson.json_util import dumps,loads,RELAXED_JSON_OPTIONS

def conn(u="mongodb://localhost:27017/"):
    """Open a MongoDB connection and return the `jobs` collection.

    Args:
        u: MongoDB connection URI (defaults to a local server).

    Returns:
        The `jobs` collection inside the `gxrc` database.
    """
    client = pymongo.MongoClient(u)
    database = client["gxrc"]
    return database["jobs"]

# Module-level handle to the `jobs` collection; the connection is opened
# as a side effect of importing/running this script.
mycol=conn()
# Uncomment to wipe previously scraped documents before a fresh run:
#mycol.delete_many({})

# HTTP headers shared by every request: imitate a desktop Firefox XHR call
# so the site returns the same markup it serves its own front-end.
headers={
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "X-Requested-With": "XMLHttpRequest",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache"
        #"referrer": "https://www.gxrc.com/jobDetail/465f5dc66aca4eaebe89784fb15b100b",
}

def parse_detail(t=""):
    """Parse a job-detail HTML page into a flat dict of text fields.

    Args:
        t: raw HTML of a gxrc.com job-detail page.

    Returns:
        dict with job description, contact info, company info/type/address
        and the raw `entDetails` paragraph texts.

    Fixes: BeautifulSoup was constructed without an explicit parser, which
    emits GuessedAtParserWarning and can pick different parsers on different
    machines; the duplicated contact-info lookup is hoisted so the document
    is searched once instead of twice.
    """
    soup = BeautifulSoup(t, "html.parser")
    # Both "other requirements" and "contact" live in the same class; fetch once.
    contact_blocks = soup.findAll(class_="contact-info one-column")
    ent_details = soup.find(id="entDetails")
    return {
        "job_info": soup.find(id="examineSensitiveWordsContent").text,
        "qita": contact_blocks[0].text,
        "contact": contact_blocks[1].text,
        "company_info": soup.find(id="examineSensitiveWordsContent2").text,
        "company_type": ent_details.find('p').text,
        "address": soup.find(class_="address").text,
        "entDetails": [p.text for p in ent_details.findAll('p')],
    }

def parse_list(t=""):
    """Parse a search-result HTML page into (total_pages, job_dicts).

    Args:
        t: raw HTML of a gxrc.com search-result page.

    Returns:
        (total, jobs) where `total` is the last page number shown by the
        pager and `jobs` is a list of dicts with position, company, salary,
        area, date and the "qita" (misc) key/value pairs.

    Fixes: BeautifulSoup was constructed without an explicit parser
    (GuessedAtParserWarning, parser varies by environment); repeated
    identical `find` calls for posName/entName are hoisted.
    """
    soup = BeautifulSoup(t, "html.parser")
    # The pager's "last page" link text is the total number of pages.
    total = int(soup.find_all(id='pgInfo_last')[0].text)
    jobs = []
    for row in soup.find_all(class_="rlOne"):
        # Misc attributes rendered as <li><strong>label</strong><span>value</span>.
        extras = {}
        for li in row.find(class_="qitaUL").findAll('li'):
            extras[li.find('strong').text] = li.find('span').text
        pos = row.find(class_="posName")
        ent = row.find(class_="entName")
        jobs.append({
            "posName": pos.text,
            "url": pos.attrs['href'],
            "posInfo": row.find(class_="posInfo").text,
            "company": ent.text,
            "company_url": ent.attrs['href'],
            "money": row.find(class_="w3").text,
            "area": row.find(class_="w4").text,
            "date": row.find(class_="w5").text,
            "qita": extras,
        })
    return total, jobs

def get_detail(u):
    """Fetch a protocol-relative job-detail URL and return its parsed fields.

    Args:
        u: URL beginning with `//` as scraped from the listing page.
    """
    full_url = 'https:' + u
    response = requests.get(full_url, headers=headers)
    return parse_detail(response.text)

def get_list(n=2,keyword="python"):
    """Fetch search-result page `n` for `keyword`.

    Each job dict is enriched in place with a `detail` key holding the
    parsed job-detail page.

    Returns:
        (total_pages, jobs)
    """
    url = "https://s.gxrc.com/sJob?keyword={}&schType=1&page={}".format(keyword, n)
    page_html = requests.get(url, headers=headers).text
    total, jobs = parse_list(page_html)
    for job in jobs:
        print("get detail ", job)
        job["detail"] = get_detail(job["url"])
        print("get detail ", job, "done")
        #time.sleep(1)
    print("page", n, "done")
    return total, jobs

def save(d=None, file_name="1.json"):
    """Serialize MongoDB documents to a JSON file.

    Uses bson.json_util.dumps so ObjectId values serialize cleanly.

    Args:
        d: list of documents to write (defaults to an empty list).
        file_name: output path.

    Fixes: mutable default argument `d=[]` (shared across calls); `open`
    without an explicit encoding, which breaks the Chinese text on systems
    whose default locale encoding is not UTF-8.
    """
    if d is None:
        d = []
    s = dumps(d, indent=4, ensure_ascii=False)
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(s)

def list_job():
    """Dump every stored job to a timestamped JSON file.

    Returns:
        The full list of documents from the `jobs` collection.
    """
    documents = [doc for doc in mycol.find()]
    out_name = "gxrc_{}.json".format(time.time())
    save(documents, out_name)
    print(documents)
    return documents

def main(n=6):
    """Scrape result pages 1..n-1, store each page in MongoDB, then dump JSON.

    Args:
        n: exclusive upper bound of the page range.

    Fixes: the original used C-style braces (`if i>total { ... }`), which is
    a SyntaxError in Python; and it called `get_list()` with no argument, so
    every iteration fetched the default page instead of page `i`.
    """
    for i in range(1, n):
        print('get page ', i, "start")
        # Pass the loop index so pagination actually advances.
        total, o = get_list(i)
        if i > total:
            # Ran past the last page reported by the site's pager.
            print("没有了")
            break
        # save to db
        x = mycol.insert_many(o)
        print(x.inserted_ids)

        print('get page ', i, "success ")
        time.sleep(1)  # be polite to the server between pages
    list_job()


# Scrape 5 pages (range(1, 6)). Guarded so importing this module for its
# helpers does not kick off a full crawl as a side effect.
if __name__ == "__main__":
    main(6)
[
    {
        "_id": {
            "$oid": "5f17b9d92c1ee3663087cbe7"
        },
        "posName": "百度SEO专员",
        "url": "//www.gxrc.com/jobDetail/032ac1bce85d454490e3f51ccdb152e0",
        "posInfo": "任职资格:\r\n1、大专以上学历\r\n2、4年以上seo经验\r\n3、会python优先,熟悉应用单站各类cms\r\n4、 熟百度算法与应对方法,最好熟悉站群泛目录\r\n5、至少有一个擅长的有效收录及排名手法。...",
        "company": "深圳市馨瑞科技有限公司",
        "company_url": "//www.gxrc.com/company/1627047",
        "money": "6001-8000",
        "area": "青秀区",
        "date": "2020-07-22",
        "qita": {
            "人数:": "1",
            "学历:": "大专",
            "经验:": "不限",
            "性质:": "民营企业"
        },
        "detail": {
            "job_info": "任职资格:\r\n1、大专以上学历\r\n2、4年以上seo经验\r\n3、会python优先,熟悉应用单站各类cms\r\n4、 熟百度算法与应对方法,最好熟悉站群泛目录\r\n5、至少有一个擅长的有效收录及排名手法。",
            "qita": "\n其它要求\n\n工作性质:全职专业要求:语言/程度:   \n职称要求:无年龄要求:不限更新时间:2020-07-22\n\n",
            "contact": "\n联系方式\n\n联系人:苏小姐\n联系电话:\n电子邮箱:373848499@qq.com\n联系地址:南宁市青秀区联发臻品-1号楼509号\n\n",
            "company_info": " 深圳馨瑞科技有限公司是一家从事软件自主研发,互联网运营的技术企业,公司致力于打造开发全球华人专用的即时通讯APP提供全球华人跨国联系最优质快捷的通讯平台,互联网产品的运营推广外包服务等。",
            "company_type": "民营企业",
            "address": "南宁市青秀区联发臻品-1号楼509号",
            "entDetails": [
                "民营企业",
                "1-50人",
                "互联网",
                "南宁市青秀区联发臻品-1号楼509号",
                ""
            ]
        }
    }
]

你可能感兴趣的:(python)