Without further ado, here's the code:
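The script below crawls job postings from jn.dazhonghr.com: it walks the search-result pages, collects the link to each posting, parses the detail page with BeautifulSoup, and writes the company and job records into two MySQL tables (ocenter_crm_company and ocenter_crm_job) through pymysql.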
# -*- coding: utf-8 -*-
import requests, time, pymysql, random
from bs4 import BeautifulSoup
import re
import json
import datetime
# Pretend to be a browser
user_agent = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
]
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36',
# 'Connection': "keep-alive"
# }
# Connect to the database (placeholder host, user, password and database name)
client = pymysql.connect(host="22222222", user="testname", password="testpwdkey", database="22222",
                         use_unicode=True, charset="utf8")
client.autocommit(True)
# Create a cursor from the connection
cursor = client.cursor()
# Fetch the listing links
"""
channel: base URL of the search-result listing
start_page: first page to crawl
page: number of pages
who_sells: name of the page-number query parameter
header_rand: index into user_agent
"""
def get_all_url_from(channel, start_page, page, who_sells='index', header_rand=0):
    headers_a = {
        'User-Agent': user_agent[header_rand],
        'Connection': "keep-alive",
        'X-Forwarded-For': '1.2.3.4',
        'referer': 'https://yantuz.cn/',
        'Content-Type': 'multipart/form-data; session_language=cn_CN',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    count = 0
    for i in range(start_page, page + 1):
        url = '{}{}={}'.format(channel, str(who_sells), str(i))
        print("URL: " + url)
        # Fetch the listing page
        wb_data = requests.get(url, headers=headers_a)
        time.sleep(3)
        wb_data.encoding = 'utf-8'
        soup = BeautifulSoup(wb_data.text, 'html.parser')
        links = soup.find("div", class_='c_new_table')
        all_a_tag = links.find_all("a")
        for l in all_a_tag:
            if l:
                if '/company/pos_' in l.get("href"):
                    print("page:" + str(i) + ",href:" + l.get("href"))
                    # Fetch the details of a single posting
                    get_iterms_from("http://jn.dazhonghr.com" + l.get("href"), headers_a)
# Fetch the details of a single posting
def get_iterms_from(url, headers_s):
    wb_data = requests.get(url, headers=headers_s)
    if wb_data.status_code == 404:
        pass
    else:
        time.sleep(2)
        wb_data.encoding = 'utf-8'
        soup = BeautifulSoup(wb_data.text, 'html.parser')
        if soup.select('h1'):
            job_name = soup.select('h1')[0].text
            if job_name:
                daiyu = soup.find("div", class_='n_fuli').find_all("li")
                company_address = soup.find(id="addr_end")["value"]
                new_daiyu = ''
                # Join the welfare items into one comma-separated string
                for d in daiyu:
                    new_daiyu = new_daiyu + d.get_text().strip() + ','
                company_intro = soup.select('div.n_r_qy')[0].text
                company_mingpian_arr = []
                company_mingpian = soup.find("div", class_='n_r_con').find_all('li')
                for kk in company_mingpian:
                    com_span_str = kk.select("span.f_b")
                    if com_span_str:
                        company_mingpian_arr.append(com_span_str[0].text)
                n_list = soup.find("div", class_='n_list').find_all("li")
                n_list_arr = []
                for k in n_list:
                    span_str = k.select("span.f_b")
                    if span_str:
                        n_list_arr.append(span_str[0].text)
                company_contact = soup.find("div", class_="n_l_txt")
                job_intro = []
                for i in company_contact:
                    # Strip HTML tags from each child node
                    dr = re.compile(r'<[^>]+>', re.S)
                    job_intro.append(str(dr.sub('', str(i))))
                company_yaoqiu = soup.find_all("div", class_="n_l_txt")
                company_yaoqiu_s = BeautifulSoup(str(company_yaoqiu), "html.parser")
                company_yaoqiu_arr = []
                for t in company_yaoqiu:
                    dr = re.compile(r'<[^>]+>', re.S)
                    dd = dr.sub('', str(t))
                    dd = str(dd).replace('\r', '')
                    company_yaoqiu_arr.append(dd)
                # print(company_yaoqiu_arr)
                company_user = company_yaoqiu_arr[2].split("\n")
                company_users = company_user[1].split(":")
                job_username = company_users[1]
                company_times = company_user[3].split(":")
                job_login_time = company_times[1]
                # Extract the posting ID from the link
                link_arr = url.strip().split("_")
                link_arr = link_arr[1].split(".")
                job_id = link_arr[0]
                data = {
                    'job_name': soup.select('h1')[0].text,
                    'job_daiyu': new_daiyu,
                    'company_address_s': company_address,
                    'job_type': n_list_arr[0],
                    'min_xueli': n_list_arr[1],
                    'job_jingyan': n_list_arr[2],
                    'job_age': n_list_arr[3],
                    'job_address': n_list_arr[4],
                    'job_sex': n_list_arr[5],
                    'job_num': n_list_arr[6],
                    'type': n_list_arr[7],
                    'job_money': n_list_arr[8],
                    'company_name': company_mingpian_arr[0],
                    'company_type': company_mingpian_arr[1],
                    'company_money': company_mingpian_arr[2],
                    'company_num': company_mingpian_arr[3],
                    'company_reg_time': company_mingpian_arr[4],
                    'company_intro': company_intro.strip(),
                    'job_intro': company_yaoqiu_arr[0],
                    'job_yaoqiu': company_yaoqiu_arr[1],
                    'job_username': job_username,
                    'job_login_time': job_login_time
                }
                # print(data)
                # Check whether the company record already exists
                com_sql = "select * from ocenter_crm_company where company_name = '%s'" % (data['company_name'].strip())
                cursor.execute(com_sql)
                com_result_one = cursor.fetchone()
                print(com_result_one)
                if com_result_one:
                    company_id = com_result_one[0]
                else:
                    # Insert the company record
                    company_add_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    com_insert_sql = "insert into ocenter_crm_company (company_name,company_address,company_type,company_capital,company_scale,company_regtime,company_intro,login_time,add_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (data['company_name'], data['company_address_s'], data['company_type'], data['company_money'], data['company_num'], data['company_reg_time'], data['company_intro'], data['job_login_time'], company_add_time)
                    print(com_insert_sql)
                    cursor.execute(com_insert_sql)
                    # Commit
                    # client.commit()
                    company_id = cursor.lastrowid
                # Check whether this job posting was already saved
                sql = "select * from ocenter_crm_job where job_links = '%s'" % (url.strip())
                cursor.execute(sql)
                result_one = cursor.fetchone()
                if result_one:
                    print("yes")
                else:
                    # Insert the job record
                    now_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    sql = "INSERT INTO `ocenter_crm_job` (company_id,job_id,job_links,job_name,job_place,job_experience,job_money,job_number,job_type,min_educational,job_age,job_sex,job_method,job_welfare,job_intro,job_requirement,job_contact,add_time) VALUES ('%d','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (company_id, job_id, url.strip(), data['job_name'], data['job_address'], data['job_jingyan'], data['job_money'], data['job_num'], data['job_type'], data['min_xueli'], data['job_age'], data['job_sex'], data['type'], data['job_daiyu'], data['job_intro'], data['job_yaoqiu'], data['job_username'], now_time)
                    print(sql)
                    cursor.execute(sql)
                    # Commit
                    # client.commit()
            else:
                pass
        else:
            pass
# Main processing
# Pick a random index into the User-Agent list
header_rand = random.randint(0, 4)
# get_iterms_from('http://www.dazhonghr.com/company/pos_1299928.shtml')
# get_all_url_from('http://jn.dazhonghr.com/person/searchResult.cshtml?',start_page=1, page=100)
# get_all_url_from('http://www.dazhonghr.com/person/searchResult.cshtml?jobclass=1100&trade=1000&eduLev=1000&workYear=1000&salary=1000&workType=1000&x=69&y=20&',start_page=1, page=8)
# job category crawl
# url = http://jn.dazhonghr.com/person/searchResult.cshtml?jobclass=1100
# job_sql = "select * from ocenter_crm_job_category"
# try:
#     # Execute the SQL statement
#     cursor.execute(job_sql)
#     # Fetch all rows
#     results = cursor.fetchall()
#     for row in results:
#         new_url = "http://jn.dazhonghr.com/person/searchResult.cshtml?jobclass=%s&" % (row[2])
#         get_all_url_from(new_url, start_page=1, page=100)
#
# except:
#     print("Error: unable to fetch data")
# trade category crawl
# url = http://jn.dazhonghr.com/person/searchResult.cshtml?jobclass=1100
job_sql = "select * from ocenter_crm_trade"
try:
    # Execute the SQL statement
    cursor.execute(job_sql)
    # Fetch all rows
    results = cursor.fetchall()
    for row in results:
        new_url = "http://jn.dazhonghr.com/person/searchResult.cshtml?trade=%s&" % (row[2])
        get_all_url_from(new_url, start_page=1, page=100, who_sells='index', header_rand=header_rand)
except Exception:
    print("Error: unable to fetch data")
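One note on the SQL above: the statements are assembled with % string formatting, so a single quote in a scraped company name or job description will break the query. Below is a minimal sketch (not part of the original script) of the same company lookup/insert rewritten with pymysql's parameterized cursor.execute(sql, args) form; the table and column names are taken from the code above, and cursor and the data dict are assumed to be the ones built inside get_iterms_from().

# Sketch only: same company lookup/insert as in get_iterms_from(), but with
# parameterized queries so quotes in scraped text cannot break the SQL.
# Assumes `cursor` and the `data` dict from get_iterms_from() are in scope.
com_sql = "select * from ocenter_crm_company where company_name = %s"
cursor.execute(com_sql, (data['company_name'].strip(),))
com_result_one = cursor.fetchone()
if com_result_one:
    company_id = com_result_one[0]
else:
    company_add_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    com_insert_sql = ("insert into ocenter_crm_company "
                      "(company_name,company_address,company_type,company_capital,company_scale,"
                      "company_regtime,company_intro,login_time,add_time) "
                      "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    cursor.execute(com_insert_sql, (
        data['company_name'], data['company_address_s'], data['company_type'],
        data['company_money'], data['company_num'], data['company_reg_time'],
        data['company_intro'], data['job_login_time'], company_add_time))
    company_id = cursor.lastrowid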