Web Scraping Summary


Requests

# Fast fetch with plain requests; easily blocked by anti-scraping checks, still needs more research
import time

import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup


def get_one_page(url):
    try:
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        headers = {
            "User-Agent": user_agent,  # the original misspelled this as "User-Agnet"
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            # Accept-Encoding is the request-side header; Content-Encoding is set by the server on responses
            "Accept-Encoding": "gzip, deflate",
            "Referer": "https://www.baidu.com/"
        }
        # Setting cookies with http.cookiejar didn't work here; see the
        # requests.Session sketch after this function for a simpler route
        time.sleep(1)
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "lxml")
            ok_html = soup.prettify()
            return ok_html
        return response.text
    except RequestException:
        return None
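
The cookie attempt above never wired the CookieJar into the request. With requests, a Session object keeps cookies between calls automatically, so a hand-rolled CookieJar is rarely needed. A minimal sketch; the URLs are placeholders:

import requests

session = requests.Session()
session.headers.update(headers)  # reuse the headers dict from get_one_page
session.get("https://example.com/set_cookie_page")  # hypothetical URL; the server sets cookies here
print(session.cookies.get_dict())  # cookies the server handed back
session.get("https://example.com/next_page")  # the same cookies are sent automatically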


# Slower: fetch the page by driving a real browser
import time

from bs4 import BeautifulSoup
from selenium import webdriver


def get_page_low(url):
    # Raw string so the backslash in the Windows path is not treated as an escape
    dpath = r"config\chromedriver.exe"
    options = webdriver.ChromeOptions()
    options.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"'
    )
    options.add_argument("--headless")  # run the browser silently
    # Set the UI language; use 'lang=zh_CN.UTF-8' for Chinese
    options.add_argument("--lang=en-US")
    # Google's docs recommend this flag to work around a headless-mode bug
    options.add_argument("--disable-gpu")
    # Disable all plugins for speed; check the effect on the about:plugins page
    # (the original used an en dash instead of "--", so Chrome ignored the flag)
    options.add_argument("--disable-plugins")
    # If chromedriver is on PATH, executable_path can be omitted; otherwise pass an absolute path
    driver = webdriver.Chrome(executable_path=dpath, chrome_options=options)
    driver.get(url)
    time.sleep(1)  # wait a second for the page to finish loading
    html = driver.page_source
    driver.quit()  # quit() also ends the chromedriver process; close() only closes the window
    soup = BeautifulSoup(html, "lxml")
    ok_html = soup.prettify()
    return ok_html
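
A fixed time.sleep(1) either wastes time or fires too early. Selenium's explicit waits poll until a condition holds; a sketch that waits up to 10 seconds for the <body> element (swap in whatever element signals that the page has loaded):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver.get(url)
# Returns as soon as <body> is present, or raises TimeoutException after 10 s
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.TAG_NAME, "body"))
)
html = driver.page_source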

Use a thread pool for the crawl so that spawning too many threads doesn't lock the machine up.

    pool = ThreadPoolExecutor(max_workers=MAX_WORKERS)  # MAX_WORKERS: cap on concurrent threads

    for item in items:
        tc = delCharater(item)
        if tc != "":
            # With threading.Thread the args had the wrong type: even a single
            # argument must be passed as a tuple. pool.submit takes them positionally.
            t = pool.submit(worker_func, arg1, arg2, arg3)  # placeholders for the real callable and its arguments
            li.append(t)
            time.sleep(random.randint(1, 5))  # random 1-5 second pause between submissions
            # Run just a few items to make testing easier
            # if (master_count >= 11):
            #     break

    print('Waiting.........................................')
    isok = 1

    for future in as_completed(li):
        data = future.result()
        td = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        print("Current time " + td + " progress " + str(isok) + '/' + str(master_num))
        isok += 1
        all_data.append(data)

    print("************************ All master IDs finished ************************")

Parsing

# Extract the item title
import re


def getItemTitle(html):
    label = ''
    try:
        # The end of the original pattern was eaten when the post was rendered
        # as HTML; matching up to the next '<' is an assumption
        pattern = re.compile(r"\"productTitle.*?>(.*?)<", re.S)
        items = re.findall(pattern, html)
        if len(items) > 0:
            label = delCharater(items[0])  # delCharater: cleanup helper defined elsewhere
    except Exception:
        # write_to_file("err_typename_list.html", "w", str(html))
        print('Error extracting the title................')
    return label
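
Since the pages are already run through BeautifulSoup elsewhere in this post, a CSS selector tends to survive markup changes better than a regex over raw HTML. A sketch, assuming the title lives in an element with class productTitle (inferred from the regex above):

from bs4 import BeautifulSoup


def get_item_title_bs(html):
    soup = BeautifulSoup(html, "lxml")
    node = soup.select_one(".productTitle")  # class name assumed from the regex above
    return node.get_text(strip=True) if node else ""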

Source code: https://gitee.com/hj_qingqingde/codes/tuh5dio216y4fesbm0n7q10

Reposted from: https://my.oschina.net/qingqingdego/blog/3018530
