# Problems encountered:
# 1. The crawler's IP gets banned
# 2. CSV file names cannot contain Chinese characters
"""
@author: cht
@time: 2019/7/15 22:23
"""
# -*- coding: utf-8 -*-  (NOTE: per PEP 263 this declaration only takes effect on line 1 or 2 of the file)
import random
import requests
from bs4 import BeautifulSoup
import time
from lxml import etree
import csv
user_agent = [
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52"]
class Boss(object):
    """Crawler for java job listings on Boss Zhipin (zhipin.com).

    For each city code it fetches up to 10 listing pages, extracts the
    job fields with XPath, and appends each row to a CSV file.  A random
    User-Agent is chosen per instance and an HTTPS proxy is fetched per
    city to reduce the chance of the crawler's IP being banned.
    """

    def __init__(self):
        # Random User-Agent per instance — part of the anti-ban strategy.
        self.headers = {"User-Agent": random.choice(user_agent)}

    def gethttpIp(self):
        """Fetch one proxy from a proxy-vendor API and build a
        requests-style ``proxies`` dict.

        If you do not want to pay for proxy IPs you can drop this method
        and scrape some free proxies instead.

        NOTE(review): the URL below is a placeholder ("proxy IP api") —
        it must be replaced with a real vendor endpoint whose response
        looks like ``host:port ...`` for the parsing here to work.
        """
        httprul = "代理IP的api"
        result = requests.get(httprul)
        print(result.text)
        IPList = result.text.split(":")
        print(IPList)
        # Proxy server: host before the colon, port up to the first space.
        proxyHost = IPList[0]
        proxyPort = IPList[1].split(" ")[0]
        proxyMeta = "https://%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
        }
        proxies = {"https": proxyMeta, }
        print(proxies)
        return proxies

    def get_job_duty(self, url, proxies):
        """Fetch one listing page and extract its job rows.

        Returns the list of extracted rows (each a 7-element list:
        title, salary, company, address, seniority, degree, detail URL),
        ``False`` when the page contains no job list (end of pagination),
        or ``None`` if extraction raised.  Side effect: appends every
        extracted row to the CSV file via :meth:`writeCSV`.
        """
        res = requests.get(url, headers=self.headers, proxies=proxies)
        res.encoding = 'utf-8'
        html = res.text
        parse_html = etree.HTML(html)
        # The <ul> of jobs sits under div[2] or div[3] depending on the
        # page layout variant, so try both before giving up.
        job_info_list = parse_html.xpath('//*[@id="main"]/div/div[2]/ul/li')
        if not job_info_list:
            job_info_list = parse_html.xpath('//*[@id="main"]/div/div[3]/ul/li')
            if not job_info_list:
                return False
        jobInfo = []
        try:
            for tr in job_info_list:
                row = [
                    tr.xpath("./div/div[1]/h3/a/div[1]/text()")[0],  # job title
                    tr.xpath("./div/div[1]/h3/a/span/text()")[0],    # salary
                    tr.xpath("./div/div[2]/div/h3/a/text()")[0],     # company
                    tr.xpath("./div/div[1]/p/text()[1]")[0],         # address
                    tr.xpath("./div/div[1]/p/text()[2]")[0],         # work seniority
                    tr.xpath("./div/div[1]/p/text()[3]")[0],         # degree
                    tr.xpath('./div/div[1]/h3/a/@href')[0],          # detail URL
                ]
                jobInfo.append(row)
            print(jobInfo)
            self.writeCSV(jobInfo)
            return jobInfo
        except Exception as e:
            # Broad catch keeps the crawl alive when a row misses a field
            # (layout drift); the error is reported and None is returned.
            print("错误原因:%s" % e)

    def writeCSV(self, jobInfo):
        """Append job rows to the CSV file.

        gb18030 encoding so Excel on Chinese Windows opens it correctly.
        ``with`` guarantees the handle is closed — the original opened a
        new file object on every call and never closed it (handle leak).
        """
        with open('C:\\Users\\Administrator\\PycharmProjects\\boss\\java\\java2WorkInfo.csv',
                  'a', newline='', encoding='gb18030') as file:
            content = csv.writer(file, dialect='excel')
            for unitinfo in jobInfo:
                content.writerow(unitinfo)

    def workOn(self):
        """Crawl the first 10 java listing pages of every city in cityList."""
        cityList = ["c100010000", "c101010100", "c101020100", "c101280100", "c101280600",
                    "c101210100", "c101110100", "c101190400", "c101200100", "c101230200",
                    "c101030100", "c101250100", "c101270100", "c101050100", "c101060100",
                    "c101070100", "c101090100", "c101100100", "c101120100", "c101180100",
                    "c101190100", "c101190200", "c101220100", "c101230100", "c101240100",
                    "c101260100", "c101290100", "c101310100", "c101300100"]
        for l in cityList:
            baseurl = "https://www.zhipin.com/%s?query=java&page=" % l
            print("city%s开始" % l)
            # One fresh proxy per city, reused for that city's pages.
            proxies = self.gethttpIp()
            i = 1
            while i <= 10:
                print("第%s页" % i)
                url = baseurl + str(i)
                print("url:%s" % url)
                time.sleep(2)  # throttle requests to lower the ban risk
                result = self.get_job_duty(url, proxies)
                # 'is False' is deliberate: None (extraction error) should
                # NOT stop pagination, only an empty page (end of listings).
                if result is False:
                    print("页面为空")
                    break
                i += 1
            print("完成city%s" % l)
        print("全部结束")
if __name__ == "__main__":
    # Script entry point: crawl Boss Zhipin java listings for every city.
    Boss().workOn()
# TODO: to be continued (未完待续)