# pyppeteer usage example
import time
from asyncio import sleep, get_event_loop
import requests
from pyppeteer import launch
from random import random
from re import compile, S
from faker import Factory
class TaoBaoSpider:
    """Demo crawler that drives a Chromium instance through pyppeteer.

    Constructing an instance immediately launches the browser and runs
    :meth:`init` to completion on the current asyncio event loop.
    """

    # Seconds the demo page is kept open after its HTML has been printed.
    _HOLD_OPEN_SECONDS = 111

    def __init__(self):
        """Store the window size and synchronously run the async setup."""
        self.width, self.height = 1500, 800
        get_event_loop().run_until_complete(self.init())

    async def init(self):
        """Launch the browser, load the demo page, print its HTML, then close the page.

        Sets ``self.browser`` and ``self.page``. The page is closed before this
        coroutine returns, even when navigation fails.
        """
        self.browser = await launch(
            headless=False,
            args=[
                '--disable-infobars',
                f'--window-size={self.width},{self.height}',
                "--user-agent={}".format(Factory.create().user_agent()),
                # NOTE(review): get_ip is not defined in the visible part of
                # this file; presumably it returns a "host:port" proxy
                # address -- confirm against its definition.
                '--proxy-server=' + get_ip(),
            ],
        )
        self.page = await self.browser.newPage()
        try:
            await self.page.setViewport({'width': self.width, 'height': self.height})
            # Register the navigator.webdriver override BEFORE navigating so it
            # is in place when the page's own scripts run; injecting it with
            # evaluate() after goto() is too late for the first document.
            await self.page.evaluateOnNewDocument(
                '()=>{Object.defineProperties(navigator,{webdriver:{get:()=>false}})}'
            )
            await self.page.goto('http://ipoipo.cn/page_59.html')
            response = await self.page.content()
            print(response)
            # Use the asyncio sleep: a blocking time.sleep() here would freeze
            # the event loop for the whole wait.
            await sleep(self._HOLD_OPEN_SECONDS)
        finally:
            await self.page.close()

    @staticmethod
    async def login():
        """Wait ten seconds.

        NOTE(review): presumably a pause that lets the user log in by hand --
        confirm the intent.
        """
        await sleep(10)

    @property
    def sleep_time(self):
        """Return a random delay in seconds, drawn from [1, 4)."""
        return 1 + random() * 3

    async def search(self):
        """Click the ``#q`` box, type the keyword and click the search button.

        A random pause (see :attr:`sleep_time`) follows every action.
        """
        await self.page.click('#q')
        await sleep(self.sleep_time)
        await self.page.keyboard.type('机械革命')
        await sleep(self.sleep_time)
        await self.page.click('#J_TSearchForm > div.search-button > button')
        await sleep(self.sleep_time)

    async def crawl(self):
        """Scroll through five result pages, printing the extracted matches of each.

        For every page the window is scrolled down in steps until the scroll
        offset reaches the body height measured at the start of that page; the
        HTML is then captured and the pager's "next" link is clicked.
        """
        # NOTE(review): '(.*?)' can only ever match empty strings, so the loop
        # below prints blank lines. The pattern looks truncated (its
        # surrounding markup may have been lost) -- restore the intended one.
        pattern = compile(r'(.*?)', S)
        # Strips HTML tags and all whitespace from each match.
        repl_pattern = compile(r'<.*?>|\s+')
        for _ in range(5):
            height = await self.page.evaluate('document.body.clientHeight')
            scrolled_height = 0
            # Offset follows a * t**2 / 2 with a random factor a in [1, 2),
            # so each step scrolls further than the previous one.
            a = 1 + random()
            t = 1
            while scrolled_height < height:
                scrolled_height = int(1 / 2 * a * t ** 2)
                await self.page.evaluate(f'window.scrollTo(0,{scrolled_height})')
                t += 1
                await sleep(self.sleep_time)
            html = await self.page.content()
            results = pattern.findall(html)
            for result in results:
                result = repl_pattern.sub('', result)
                print(result)
            print()
            await sleep(self.sleep_time)
            await self.page.click('#mainsrp-pager > div > div > div > ul > li.item.next > a')
            await sleep(self.sleep_time)
        await sleep(self.sleep_time)
# Script entry point: constructing the spider launches the browser and runs
# its setup coroutine to completion.
if __name__ == '__main__':
    TaoBaoSpider()