gxrc

A small scraper that pulls job listings for a keyword from www.gxrc.com, follows each listing to its detail page, stores the results in MongoDB, and dumps the collection to a JSON file.
pip3 install requests
pip3 install beautifulsoup4
pip3 install pymongo
# start the MongoDB server
mongod
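Before running the scraper, it helps to confirm that mongod is actually reachable. A minimal check (my addition; it assumes the default localhost port):

import pymongo

# Fails fast with ServerSelectionTimeoutError if mongod is not running.
client = pymongo.MongoClient("mongodb://localhost:27017/", serverSelectionTimeoutMS=2000)
client.admin.command("ping")

The scraper script: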
import requests
from bs4 import BeautifulSoup
import json
import time
import pymongo
from bson.json_util import dumps
def conn(u="mongodb://localhost:27017/"):
    # Connect to MongoDB and return the "jobs" collection of the "gxrc" database.
    myclient = pymongo.MongoClient(u)
    mydb = myclient["gxrc"]
    mycol = mydb["jobs"]
    return mycol

mycol = conn()
# mycol.delete_many({})  # uncomment to clear the collection before a fresh run
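# Note: every run inserts fresh documents, so repeated runs duplicate jobs.
# A hedged alternative to delete_many (my addition, not in the original) is a
# unique index on the listing URL, so MongoDB rejects re-inserts of a job:
# mycol.create_index("url", unique=True)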
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:79.0) Gecko/20100101 Firefox/79.0",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "X-Requested-With": "XMLHttpRequest",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
    # "Referer": "https://www.gxrc.com/jobDetail/465f5dc66aca4eaebe89784fb15b100b",
}
def parse_detail(t=""):
    # Parse a job-detail page into a dict of its main sections.
    soup = BeautifulSoup(t, "html.parser")
    contact_blocks = soup.find_all(class_="contact-info one-column")
    return {
        "job_info": soup.find(id="examineSensitiveWordsContent").text,
        "qita": contact_blocks[0].text,      # "other requirements" block
        "contact": contact_blocks[1].text,   # contact person / phone / address
        "company_info": soup.find(id="examineSensitiveWordsContent2").text,
        "company_type": soup.find(id="entDetails").find('p').text,
        "address": soup.find(class_="address").text,
        "entDetails": [x.text for x in soup.find(id="entDetails").find_all('p')],
    }
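# Minimal usage sketch for parse_detail; the job id below comes from the
# commented-out Referer value above and is only an example:
# html = requests.get("https://www.gxrc.com/jobDetail/465f5dc66aca4eaebe89784fb15b100b",
#                     headers=headers).text
# print(parse_detail(html)["address"])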
def parse_list(t=""):
    # Parse one page of search results; return (last page number, list of jobs).
    soup = BeautifulSoup(t, "html.parser")
    total = int(soup.find_all(id='pgInfo_last')[0].text)
    o = []
    for i in soup.find_all(class_="rlOne"):
        # Collect the "other" fields (headcount, education, experience, ...) into a dict.
        z = {}
        for q in i.find(class_="qitaUL").find_all('li'):
            z[q.find('strong').text] = q.find('span').text
        r = {
            "posName": i.find(class_="posName").text,
            "url": i.find(class_="posName").attrs['href'],
            "posInfo": i.find(class_="posInfo").text,
            "company": i.find(class_="entName").text,
            "company_url": i.find(class_="entName").attrs['href'],
            "money": i.find(class_="w3").text,
            "area": i.find(class_="w4").text,
            "date": i.find(class_="w5").text,
            "qita": z,
        }
        o.append(r)
    return total, o
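# parse_list can be tried on its own against page 1 of a "python" search:
# total, jobs = parse_list(requests.get(
#     "https://s.gxrc.com/sJob?keyword=python&schType=1&page=1",
#     headers=headers).text)
# print(total, len(jobs), jobs[0]["posName"])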
def get_detail(u):
    # Listing URLs are protocol-relative ("//www.gxrc.com/..."), so prepend the scheme.
    u1 = 'https:' + u
    t = requests.get(u1, headers=headers).text
    return parse_detail(t)
def get_list(n=2, keyword="python"):
    # Fetch search-result page n for the keyword, then fetch each job's detail page.
    u = "https://s.gxrc.com/sJob?keyword={}&schType=1&page={}".format(keyword, n)
    t = requests.get(u, headers=headers).text
    total, r = parse_list(t)
    for i in r:
        print("get detail", i)
        i["detail"] = get_detail(i["url"])
        print("get detail", i, "done")
        # time.sleep(1)
    print("page", n, "done")
    return total, r
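# None of the requests.get calls above set a timeout or retry on failure; a
# small wrapper (my addition, not part of the original script) that get_list
# and get_detail could call in place of requests.get:
def get_with_retry(u, retries=3, timeout=10):
    # Retry the GET a few times, pausing briefly between attempts.
    for attempt in range(retries):
        try:
            return requests.get(u, headers=headers, timeout=timeout)
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2)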
def save(d=[], file_name="1.json"):
    # Pretty-print the documents as UTF-8 JSON.
    s = dumps(d, indent=4, ensure_ascii=False)
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(s)
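# save() relies on bson.json_util.dumps: documents read back from MongoDB carry
# ObjectId values in "_id", which plain json.dumps would reject with TypeError.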
def list_job():
    # Dump everything in the collection to a timestamped JSON file.
    r1 = list(mycol.find())
    file_name = "gxrc_{}.json".format(time.time())
    save(r1, file_name)
    print(r1)
    return r1
def main(n=6):
    for i in range(1, n):
        print('get page', i, "start")
        total, o = get_list(i)
        if i > total:
            print("no more pages")
            break
        # save to db
        x = mycol.insert_many(o)
        print(x.inserted_ids)
        print('get page', i, "success")
        time.sleep(1)
    # Dump everything collected to a JSON file.
    list_job()

# 5 pages (range(1, 6))
main(6)
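Sample of the dumped gxrc_<timestamp>.json (one document shown):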
[
    {
        "_id": {
            "$oid": "5f17b9d92c1ee3663087cbe7"
        },
        "posName": "百度SEO专员",
        "url": "//www.gxrc.com/jobDetail/032ac1bce85d454490e3f51ccdb152e0",
        "posInfo": "任职资格:\r\n1、大专以上学历\r\n2、4年以上seo经验\r\n3、会python优先,熟悉应用单站各类cms\r\n4、 熟百度算法与应对方法,最好熟悉站群泛目录\r\n5、至少有一个擅长的有效收录及排名手法。...",
        "company": "深圳市馨瑞科技有限公司",
        "company_url": "//www.gxrc.com/company/1627047",
        "money": "6001-8000",
        "area": "青秀区",
        "date": "2020-07-22",
        "qita": {
            "人数:": "1",
            "学历:": "大专",
            "经验:": "不限",
            "性质:": "民营企业"
        },
        "detail": {
            "job_info": "任职资格:\r\n1、大专以上学历\r\n2、4年以上seo经验\r\n3、会python优先,熟悉应用单站各类cms\r\n4、 熟百度算法与应对方法,最好熟悉站群泛目录\r\n5、至少有一个擅长的有效收录及排名手法。",
            "qita": "\n其它要求\n\n工作性质:全职专业要求:语言/程度: \n职称要求:无年龄要求:不限更新时间:2020-07-22\n\n",
            "contact": "\n联系方式\n\n联系人:苏小姐\n联系电话:\n电子邮箱:373848499@qq.com\n联系地址:南宁市青秀区联发臻品-1号楼509号\n\n",
            "company_info": " 深圳馨瑞科技有限公司是一家从事软件自主研发,互联网运营的技术企业,公司致力于打造开发全球华人专用的即时通讯APP提供全球华人跨国联系最优质快捷的通讯平台,互联网产品的运营推广外包服务等。",
            "company_type": "民营企业",
            "address": "南宁市青秀区联发臻品-1号楼509号",
            "entDetails": [
                "民营企业",
                "1-50人",
                "互联网",
                "南宁市青秀区联发臻品-1号楼509号",
                ""
            ]
        }
    }
]