我们都知道 Selenium 目前已经停更很长时间了,而且很多网站都已经能够识别 Selenium,对于新手来说,想要简单地模拟浏览器去采集数据就显得很困难。那么猫哥今天就给大家介绍一款比 Selenium 更好用的浏览器模拟工具 pyppeteer。pyppeteer 是一位日本工程师根据 Puppeteer 开发出来的非官方 Python 版本,而 Puppeteer 则是由 Google 开发的。具体信息大家可以自行百度一下,猫哥这里就不再啰嗦了。总之一句话,pyppeteer 比 Selenium 要强大得多,而且还是异步的,效率也远超 Selenium。
下面猫哥主要讲一下pyppeteer的常用操作。
async def getbrowser(self):
    """Launch the browser and open a page inside an incognito context.

    Stores the launched browser on ``self.browser`` and the newly opened
    page on ``self.page``.
    """
    width, height = 1366, 768
    self.browser = await pyppeteer.launch(
        # Show the browser window; switch to headless=True to run without UI.
        headless=False,
        # Launch timeout handed to pyppeteer (milliseconds per its docs).
        timeout=1500,
        # Do not open the developer tools panel.
        devtools=False,
        dumpio=True,
        options={
            'args': [
                '--no-sandbox',
                # Hide the info bar.
                '--disable-infobars',
                f'--window-size={width},{height}',
                '--disable-extensions',
                '--hide-scrollbars',
                '--disable-bundled-ppapi-flash',
                '--mute-audio',
                '--disable-setuid-sandbox',
                '--disable-gpu',
            ],
        },
    )
    # Incognito browser context. ``createIncogniteBrowserContext`` is the
    # deprecated misspelled alias of this method.
    context = await self.browser.createIncognitoBrowserContext()
    # Open the page on the context itself: ``context.browser.newPage()`` goes
    # back to the parent Browser and would create the page in the default
    # (non-incognito) context.
    self.page = await context.newPage()
async def getbrowser(self):
    """Launch the browser behind a proxy and open an incognito page.

    The proxy address comes from ``GetIP()`` and is passed to the browser
    through the ``--proxy-server`` flag. Stores the launched browser on
    ``self.browser`` and the newly opened page on ``self.page``.
    """
    width, height = 1366, 768
    # Use a proxy IP.
    proxy_ip = GetIP()
    print("当前使用的代理IP是" + proxy_ip)
    self.browser = await pyppeteer.launch(
        # Show the browser window; switch to headless=True to run without UI.
        headless=False,
        # Launch timeout handed to pyppeteer (milliseconds per its docs).
        timeout=1500,
        # Do not open the developer tools panel.
        devtools=False,
        dumpio=True,
        options={
            'args': [
                '--no-sandbox',
                # Hide the info bar.
                '--disable-infobars',
                f'--window-size={width},{height}',
                '--disable-extensions',
                '--hide-scrollbars',
                '--disable-bundled-ppapi-flash',
                '--mute-audio',
                '--disable-setuid-sandbox',
                '--disable-gpu',
                # Route traffic through the proxy, given as "host:port"
                # (e.g. 111.29.3.186:8080).
                f'--proxy-server={proxy_ip}',
            ],
        },
    )
    # Incognito browser context. ``createIncogniteBrowserContext`` is the
    # deprecated misspelled alias of this method.
    context = await self.browser.createIncognitoBrowserContext()
    # Open the page on the context itself: ``context.browser.newPage()`` goes
    # back to the parent Browser and would create the page in the default
    # (non-incognito) context.
    self.page = await context.newPage()
# Pool of User-Agent strings (desktop and mobile browsers) kept on the
# instance; one entry is later chosen with random.choice().
# NOTE(review): the plain "MSIE 7.0; Windows NT 5.1" entry appears twice in
# this list, so it is picked twice as often as any other entry.
self.user_agent = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
"Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
然后在发起请求前设置 UA(User-Agent),代码如下。
# UA
await self.page.setUserAgent(random.choice(self.user_agent))
await self.page.setViewport({'width': 1366, 'height': 768})
await self.page.evaluate("""() =>{Object.defineProperties(navigator, {webdriver:{get: () => false}})}""")
await self.page.evaluate('''() => {window.navigator.chrome = {runtime: {}, }; }''')
await self.page.evaluate('''() =>{Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});}''')
await self.page.evaluate('''() =>{Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], }); }''')
def input_time_random(self):
    """Return a random integer between 100 and 161 (both inclusive).

    Callers use the value as a per-keystroke typing delay.
    """
    shortest, longest = 100, 161
    return random.randint(shortest, longest)
await self.page.type('#username', "testdemo", {'delay': self.input_time_random() - 60})
await self.page.type('#password', "123456", {'daelay': self.input_time_random()})
# 等待
await asyncio.sleep(3)
# 鼠标模拟点击(类似于毫无目的的在屏幕上点击,迷惑对方)
self.page.mouse
# 点击提交,并等待页面正确响应。
await asyncio.gather(
self.page.click("#Submit"),
self.page.waitForNavigation(),
)
# 获取tr
tr_list = await self.page.xpath("//div[@class='table-body']/table/tbody/tr")
for tr in tr_list:
# 获取button标签
button = await tr.xpath("./td[(last())]/div/button")
try:
# 点击进入下一页
await button[0].click()
await asyncio.sleep(3)
# 进行其他内容提取操作
# ...
except Exception as e:
print(e)
await self.page.evaluate('window.scrollBy(0, document.body.scrollHeight)')
await self.page.screenshot({'./': 'picture_name.png'})
今天就暂且和大家分享到这里。如果猫哥对 pyppeteer 有了更多的了解,还会继续和大家分享,也欢迎大家把知识分享给猫哥!