"""Scrape Python job postings from hr.tencent.com into a text file.

Each posting is appended to ``tengxunzhaopin.txt`` as the ``str()`` of a
6-tuple ``(jobName, jobUrl, jobType, jobnum, jobAddr, jobTime)``, one per line.
"""
import math
import re

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36"
}

# Number of postings per listing page; used both to compute the page count
# and to build the ``start`` offset of each page URL.
PAGE_SIZE = 10

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10


def getPageNum(url):
    """Return the total number of result pages for the listing at *url*.

    The total number of postings is read from the first element matching
    ``.lightblue.total`` and divided by PAGE_SIZE, rounding up.

    Raises:
        IndexError: if the page contains no ``.lightblue.total`` element.
        ValueError: if that element's text is not an integer.
    """
    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
    sup = BeautifulSoup(res, 'lxml')
    # Total number of postings.
    num = sup.select(".lightblue.total")[0].text
    # Round up so a partially filled last page is still counted.
    return math.ceil(int(num) / PAGE_SIZE)


def getJobInfo(url):
    """Fetch one listing page, print every posting and append it to the file.

    Rows are the ``<tr>`` elements with class ``even`` or ``odd``; the five
    cells of each row supply name/link, type, head count, location and date.
    """
    # e.g. url = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start=10#a"
    res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
    soup = BeautifulSoup(res, 'lxml')
    # Collect the table rows that hold postings.
    jobList = soup.find_all('tr', class_=['even', 'odd'])
    strurl = 'https://hr.tencent.com/'
    # Open the output file once per page instead of once per row.
    with open('tengxunzhaopin.txt', 'a', encoding='utf-8', errors='ignore') as f:
        for job in jobList:
            link = job.select('td:nth-of-type(1) > a')[0]
            jobName = link.text
            # The href is relative to the site root, so prefix the host.
            jobUrl = strurl + link['href']
            jobType = job.select('td:nth-of-type(2)')[0].text
            jobnum = job.select('td:nth-of-type(3)')[0].text
            jobAddr = job.select('td:nth-of-type(4)')[0].text
            jobTime = job.select('td:nth-of-type(5)')[0].text
            print(jobName, jobUrl, jobType, jobnum, jobAddr, jobTime)
            # Store the tuple's str() form, one record per line.
            f.write(str((jobName, jobUrl, jobType, jobnum, jobAddr, jobTime)) + '\n')


if __name__ == '__main__':
    url = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0"
    pagenum = getPageNum(url)
    print(pagenum)
    # Build the URL of every page: ``start`` is the offset of its first row.
    # NOTE: the placeholder must be ``%d``; the previous ``d%#a`` was parsed
    # as an ascii() conversion and produced ``start=d10`` with no fragment.
    for i in range(pagenum):
        page_url = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start=%d#a" % (i * PAGE_SIZE)
        getJobInfo(page_url)
----------------------------------------------------------------------------------------
"""Load the scraped postings from ``tengxunzhaopin.txt`` into MySQL.

Each non-empty line of the file is expected to be the ``str()`` of a 6-tuple
``(jobName, jobUrl, jobType, jobNum, jobAddr, jobTime)``; every tuple becomes
one row of the ``tenxunzhap`` table.
"""
import ast

import pymysql

# Placeholders are filled in by the driver, which quotes and escapes each
# value; building the statement with ``%r`` produced Python reprs rather than
# SQL literals and allowed the file's contents to alter the statement.
sql = (
    "INSERT INTO tenxunzhap(jobName, jobUrl, jobType, jobNum, jobAddr, jobTime) VALUES "
    "(%s,%s,%s,%s,%s,%s)"
)

db = pymysql.connect(host="127.0.0.1", port=3306,
                     user="root", password="xxxxx",
                     db="pydata201806", charset="utf8")
try:
    cur = db.cursor()
    try:
        with open('tengxunzhaopin.txt', 'r', errors='ignore', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines instead of failing to parse them.
                    continue
                # SECURITY: this used to be eval(), which executes arbitrary
                # code found in the file. literal_eval only accepts Python
                # literals, which is all a str()-dumped tuple of strings needs.
                job = ast.literal_eval(line)
                params = tuple(job[:6])
                print(job)
                # Show the statement exactly as it will be sent to the server.
                print(cur.mogrify(sql, params))
                cur.execute(sql, params)
        # Commit once after all rows: a failure part-way leaves no partial import.
        db.commit()
    finally:
        cur.close()
finally:
    # Always release the connection, even when parsing or an insert fails.
    db.close()