2019独角兽企业重金招聘Python工程师标准>>>
请求部分
# Fast page fetch (plain HTTP request); easily detected and blocked by the site, so this still needs more work.
def get_one_page(url, timeout=30):
    """Fetch ``url`` with a plain HTTP GET and return its HTML as text.

    Browser-like request headers are sent along with the request.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The prettified HTML when the server answers with status 200, the
        raw response text for any other status code, or ``None`` when the
        request fails with a ``RequestException`` (including a timeout).
    """
    user_agent = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    headers = {
        # Was misspelled "User-Agnet", so the custom agent was never used.
        "User-Agent": user_agent,
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        # Was "zhCN,...,cnq=0.3", which is not a valid language list.
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        # Content-Encoding describes a message body; the request header
        # that advertises supported compression is Accept-Encoding.
        "Accept-Encoding": "gzip",
        "referer": "https://www.baidu.com/",
    }
    # Cookie handling via http.cookiejar.CookieJar was tried here but the
    # cookies could not be read that way, so it is not used.

    # Fixed one-second pause before every request (presumably to throttle
    # the crawl).
    time.sleep(1)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "lxml")
        return soup.prettify()
    return response.text
# Slow page fetch: loads the page in a real (headless) Chrome browser.
def get_page_low(url):
    """Load ``url`` in headless Chrome through Selenium and return its HTML.

    Slower than a plain HTTP request because a browser process is started
    for every call.

    Args:
        url: Address of the page to open.

    Returns:
        The prettified page source as rendered by the browser.
    """
    # Raw string: the original "config\chromedriver.exe" relied on "\c"
    # being an unknown escape, which newer Python versions warn about.
    dpath = r"config\chromedriver.exe"

    options = webdriver.ChromeOptions()
    # NOTE(review): the inner double quotes are part of the argument value
    # as written; confirm the resulting user agent is what is intended.
    options.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"'
    )
    options.add_argument("--headless")  # run without a visible window
    # Browser UI language. The original passed the bare string "en-us",
    # which is not a Chrome switch. For Chinese use:
    # options.add_argument('lang=zh_CN.UTF-8')
    options.add_argument("--lang=en-US")
    # Google's documentation mentions this flag is needed to avoid a bug.
    options.add_argument("--disable-gpu")
    # Disable all plugins to speed things up (effect visible on
    # about:plugins). The original used an en dash ("–") instead of "--",
    # so the switch was not recognised.
    options.add_argument("--disable-plugins")

    # If chromedriver is on PATH the first argument can be omitted;
    # otherwise pass the path to the executable.
    driver = webdriver.Chrome(executable_path=dpath, chrome_options=options)
    try:
        driver.get(url)
        time.sleep(1)  # wait one second so the page can finish loading
        html = driver.page_source
    finally:
        # quit() ends the whole browser session; close() only closes the
        # current window. Running it in ``finally`` avoids leaking the
        # browser when loading the page fails.
        driver.quit()

    soup = BeautifulSoup(html, "lxml")
    return soup.prettify()
使用线程池抓取,避免线程开得太多导致程序卡死。
# Submit one task per non-empty item to a bounded thread pool, then collect
# the results as the tasks finish.
# NOTE(review): the original indentation was lost; the nesting below is
# reconstructed from the statement order and should be confirmed. The
# non-ASCII identifiers (最大运行数量, 调用方法, 参数1..3) appear to be
# template placeholders for the worker count, the callable and its arguments.
pool = ThreadPoolExecutor(max_workers=最大运行数量)
for item in items:
    tc = delCharater(item)
    if tc != "":
        # Author's note: passing arguments with the wrong type was a
        # problem earlier; even a single argument has to be given as a tuple.
        t = pool.submit(调用方法, 参数1, 参数2,参数3)
        li.append(t)
        time.sleep(random.randint(1, 5))  # pause for a random 1-5 seconds
        # Run only a few items to make testing easier:
        # if (master_count >= 11):
        #     break
print('等待中.........................................')
# 1-based counter of finished tasks, used only for the progress message.
isok = 1
for future in as_completed(li):
    # result() returns the task's value, or re-raises the exception the
    # task raised.
    data = future.result()
    td = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print("当前时间 " + td + "完成进度 " + str(isok) + '/' + str(master_num))
    isok += 1
    all_data.append(data)
print("************************主编号全部执行完毕************************")
解析总结
# 取标题
def getItemTitle(html):
    """Extract the product title from a page's HTML.

    Looks for the first element whose attribute value starts with
    ``"productTitle`` and takes the text between the end of that opening
    tag and the next ``<``.

    Args:
        html: Page source as a string.

    Returns:
        The title after cleaning it with ``delCharater``, or an empty
        string when no title is found or an error occurs.
    """
    label = ''
    try:
        # The capture group must be followed by something. A trailing lazy
        # "(.*?)" on its own always matches the empty string, so the
        # original pattern never captured the title. Anchoring on the next
        # "<" captures the element's text.
        match = re.search(r"\"productTitle.*?>(.*?)<", html, re.S)
        if match:
            label = delCharater(match.group(1))
    except Exception:
        # Best effort: report the problem and fall back to an empty title.
        print('抓取标题出错................')
    return label
源码 https://gitee.com/hj_qingqingde/codes/tuh5dio216y4fesbm0n7q10