Python Study Notes 8 --- Advanced Web Scraping

Multithreading concepts and the threading module

Without multithreading

import time

def coding():
    for x in range(3):
        print('writing code %s' % x)
        time.sleep(1)

def drawing():
    for x in range(3):
        print('drawing %s' % x)
        time.sleep(1)

def main():
    coding()
    drawing()

if __name__=='__main__':
    main()

With multithreading

import threading
import time

def coding():
    for x in range(3):
        print('writing code %s' % x)
        time.sleep(1)

def drawing():
    for x in range(3):
        print('drawing %s' % x)
        time.sleep(1)

def main():
    t1 = threading.Thread(target=coding)
    t2 = threading.Thread(target=drawing)
    t1.start()
    t2.start()

if __name__=='__main__':
    main()

Creating threads with the Thread class

# -*- coding: utf-8 -*-
"""
Created on Tue Mar 26 16:37:44 2019

@author: admin
"""

import threading
import time

class CodingThread(threading.Thread):
    def run(self):
        for x in range(3):
            print('writing code %s' % threading.current_thread())
            time.sleep(1)

class DrawingThread(threading.Thread):
    def run(self):
        for x in range(3):
            print('drawing %s' % threading.current_thread())
            time.sleep(1)
            
def main():
    t1 = CodingThread()
    t2 = DrawingThread()
    
    t1.start()
    t2.start()
    
    
if __name__=='__main__':
    main()

Sharing global variables across threads, and the lock mechanism

Use the lock wherever a global variable is modified; merely reading a global variable does not require locking.

import threading

VALUE = 0
gLock = threading.Lock()
def add_value():
    # 'global' is required when modifying a global variable inside a function
    global VALUE
    gLock.acquire()
    for x in range(100000):
        VALUE += 1
    gLock.release()
    print('value: %d' % VALUE)
    
    
def main():
    for  x in range(2):
        t=threading.Thread(target=add_value)
        t.start()
        
if __name__=='__main__':
    main()
 

Lock version of producer and consumer

# -*- coding: utf-8 -*-
"""
Created on Wed Mar 27 10:36:15 2019

@author: admin
"""

import threading
import time
import random


gMoney=1000
gLock =threading.Lock()
gTotalTimes=10
gTimes=0

class Producer(threading.Thread):
    def run(self):
        global gMoney
        global gTimes
        while True:
            money = random.randint(100,1000)
            gLock.acquire()
            if gTimes >= gTotalTimes:
                gLock.release()
                break
            gMoney += money
            print('%s produced %d yuan, balance: %d yuan' % (threading.current_thread(), money, gMoney))
            gTimes += 1
            gLock.release()
            time.sleep(0.5)

class Consumer(threading.Thread):
    def run(self):
        global gMoney

        while True:
            money = random.randint(100,1000)
            gLock.acquire()
            if gMoney >= money:
                gMoney -= money
                print('%s consumed %d yuan, balance: %d yuan' % (threading.current_thread(), money, gMoney))
            else:
                if gTimes >= gTotalTimes:
                    gLock.release()
                    break
                print('%s wants to consume %d yuan but balance is only %d yuan - not enough!' % (threading.current_thread(), money, gMoney))
            gLock.release()
            time.sleep(0.5)

def main():

    for x in range(3):
        t = Consumer(name='consumer thread %d' % x)
        t.start()

    for x in range(5):
        t = Producer(name='producer thread %d' % x)
        t.start()
        
if __name__=='__main__':
    main()

Condition version of producer and consumer

Condition extends Lock with wait() and notify(): a consumer that finds the balance insufficient calls wait() to sleep until a producer calls notify_all(), instead of repeatedly acquiring and releasing the lock to poll.

# -*- coding: utf-8 -*-
"""
Created on Wed Mar 27 10:36:15 2019

@author: admin
"""

import threading
import time
import random


gMoney=1000
gCondition =threading.Condition()
gTotalTimes=10
gTimes=0

class Producer(threading.Thread):
    def run(self):
        global gMoney
        global gTimes
        while True:
            money = random.randint(100,1000)
            gCondition.acquire()
            if gTimes >= gTotalTimes:
                gCondition.release()
                break
            gMoney += money
            print('%s produced %d yuan, balance: %d yuan' % (threading.current_thread(), money, gMoney))
            gTimes += 1
            gCondition.notify_all()
            gCondition.release()
            time.sleep(0.5)

class Consumer(threading.Thread):
    def run(self):
        global gMoney

        while True:
            money = random.randint(100,1000)
            gCondition.acquire()
            # a while loop, not an if: re-check the balance after every wakeup
            while gMoney < money:
                if gTimes >= gTotalTimes:
                    gCondition.release()
                    return
                print('%s wants to consume %d yuan but balance is only %d yuan - not enough!' % (threading.current_thread(), money, gMoney))
                gCondition.wait()
            gMoney -= money
            print('%s consumed %d yuan, balance: %d yuan' % (threading.current_thread(), money, gMoney))
            gCondition.release()
            time.sleep(0.5)

def main():

    for x in range(3):
        t = Consumer(name='consumer thread %d' % x)
        t.start()

    for x in range(5):
        t = Producer(name='producer thread %d' % x)
        t.start()
        
        
if __name__=='__main__':
    main()

Queue: a thread-safe queue

To store data in a queue, Python has a built-in thread-safe module called queue. The queue module provides synchronized, thread-safe queue classes, including the FIFO queue Queue and the LIFO queue LifoQueue. These queues implement lock primitives (think of them as atomic operations: either nothing happens, or everything completes), so they can be used directly from multiple threads. Queues can also be used to synchronize threads.
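
A minimal sketch of the common Queue operations (the capacity and values here are arbitrary):

from queue import Queue

q = Queue(4)        # a FIFO queue holding at most 4 items
for x in range(4):
    q.put(x)        # put() blocks once the queue is full

print(q.full())     # True
print(q.qsize())    # 4
print(q.get())      # 0 - FIFO: first in, first out
print(q.empty())    # False
# q.get() blocks until an item is available, which is what makes
# Queue convenient for handing work between threads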

Scraping Doutula (斗图啦) images without threads

# -*- coding: utf-8 -*-
"""
Created on Wed Mar 27 11:52:37 2019

@author: admin
"""
import requests
from lxml import etree
from urllib import request
import re
import os


def parse_page(url):
    headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
            }
    # headers must be passed as a keyword argument
    response = requests.get(url, headers=headers)
    text = response.text
    html = etree.HTML(text)
    imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
    for img in imgs:
        img_url = img.get('data-original')
        alt = img.get('alt')
        # strip punctuation (Chinese and ASCII) so alt is a valid filename
        alt = re.sub(r'[\??\.!!::,]', '', alt)
        suffix = os.path.splitext(img_url)[1].replace('!dta', '')
        filename = alt + suffix
        # create the images folder yourself beforehand
        request.urlretrieve(img_url, 'images/' + filename)


def main():
    for x in range(1,101):
        url='https://www.doutula.com/photo/list/?page=%d'%x
        parse_page(url)       


if __name__=='__main__':
    main()
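
A possible multithreaded variant, combining the Queue module from the previous section with the same parsing logic (a sketch: the xpaths, URL pattern, and '!dta' cleanup are copied from the code above; the queue sizes, the 5+5 thread split, and the 30-second shutdown timeout are arbitrary choices):

from queue import Queue, Empty
import threading
import requests
from lxml import etree
from urllib import request
import re
import os

HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
}

class Producer(threading.Thread):
    # takes page urls from page_queue, pushes (img_url, filename) onto img_queue
    def __init__(self, page_queue, img_queue):
        super().__init__()
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            try:
                url = self.page_queue.get_nowait()  # exit once all pages are claimed
            except Empty:
                break
            response = requests.get(url, headers=HEADERS)
            html = etree.HTML(response.text)
            imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
            for img in imgs:
                img_url = img.get('data-original')
                alt = re.sub(r'[\??\.!!::,]', '', img.get('alt'))
                suffix = os.path.splitext(img_url)[1].replace('!dta', '')
                self.img_queue.put((img_url, alt + suffix))

class Consumer(threading.Thread):
    # downloads the images queued by the producers
    def __init__(self, img_queue):
        super().__init__()
        self.img_queue = img_queue

    def run(self):
        while True:
            try:
                img_url, filename = self.img_queue.get(timeout=30)
            except Empty:
                break  # assume the producers are done if nothing arrives for 30s
            request.urlretrieve(img_url, 'images/' + filename)

def main():
    page_queue = Queue(100)
    img_queue = Queue(1000)
    for x in range(1, 101):
        page_queue.put('https://www.doutula.com/photo/list/?page=%d' % x)
    for _ in range(5):
        Producer(page_queue, img_queue).start()
    for _ in range(5):
        Consumer(img_queue).start()

if __name__ == '__main__':
    main()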

The GIL (Global Interpreter Lock) explained

The interpreter that ships with Python is CPython. CPython's multithreading is effectively fake multithreading (on a multi-core CPU it can only use one core): only one thread executes at any given moment, and the GIL in CPython is what enforces this. Other interpreters:

  1. Jython: a Python interpreter implemented in Java; no GIL.
  2. IronPython: a Python interpreter implemented in .NET; no GIL.
  3. PyPy: a Python interpreter implemented in Python; has a GIL.

Even though the GIL makes multithreading "fake", threads still improve efficiency for I/O-bound work (file reads/writes, network requests). For CPU-bound computation, multiprocessing is recommended over multithreading, as the sketch below illustrates.
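
A minimal sketch of the difference (timings are machine-dependent and the loop count is arbitrary):

import threading
import time

def cpu_task():
    # CPU-bound: pure computation; the GIL serializes the threads
    n = 0
    for _ in range(10_000_000):
        n += 1

def io_task():
    # I/O-bound: sleeping releases the GIL, so the threads overlap
    time.sleep(1)

def timed(target, workers):
    start = time.time()
    threads = [threading.Thread(target=target) for _ in range(workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return time.time() - start

print('2 CPU-bound threads: %.2fs' % timed(cpu_task, 2))  # roughly 2x one thread's time
print('2 I/O-bound threads: %.2fs' % timed(io_task, 2))   # about 1s, not 2s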

Scraping jokes from Budejie (百思不得姐)

(code snippet placeholder)

Introduction to AJAX and two ways to scrape AJAX data

AJAX stands for Asynchronous JavaScript and XML. AJAX lets a page update asynchronously: instead of reloading the whole page, only part of it is refreshed (for example, clicking "load more" on the xfz (小饭桌) site: the newly loaded content is not visible via View Source, but it does appear in the Elements panel). A traditional page (without AJAX) must reload entirely to update its content. The name comes from the XML format the data exchange originally used; today the data is almost always JSON, but the name stuck.

Ways to get AJAX data

1. Analyze the AJAX endpoint directly, then request that endpoint from code.
2. Use Selenium + chromedriver to simulate browser behavior and get the data.

Method           | Pros                                                                    | Cons
Analyze the API  | Gets the data directly; no HTML parsing, little code, high performance | Analyzing the API can be complex, especially when obfuscated with JS, and the crawler is easier to detect
Selenium         | Anything a browser can request, Selenium can request too; more robust  | More code, lower performance
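
A minimal sketch of approach 1 (the endpoint URL, parameters, and response fields below are hypothetical; the real ones come from the browser's Network panel):

import requests

# hypothetical endpoint found in the Network panel while clicking 'load more'
url = 'https://example.com/api/list'
params = {'page': 1}
headers = {'User-Agent': 'Mozilla/5.0'}  # use a real browser UA string here

resp = requests.get(url, params=params, headers=headers)
data = resp.json()   # AJAX responses are usually JSON, so no HTML parsing is needed
for item in data.get('items', []):   # 'items' is a hypothetical field name
    print(item)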

Drivers for different browsers:
http://www.sites.google.com/a/chromium.org/chromedriver/downloads
Installing chromedriver: after downloading, just place it under a pure-English path that requires no special permissions.

Closing a page and the browser with Selenium

from selenium import webdriver
import time

driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("https://www.baidu.com/")
time.sleep(5)
driver.close()   # closes the current page only
# driver.quit()  # quits the entire browser

Finding elements by attribute

from selenium import webdriver

driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path = driver_path)
driver.get("https://www.baidu.com/")
# inputTag = driver.find_element_by_id('kw')
# inputTag = driver.find_element_by_name('wd')

# find_element_by_css_selector returns a single element (the first match),
# so don't index it; use find_elements_by_css_selector(...)[0] for a list
inputTag = driver.find_element_by_css_selector(".quickdelete-wrap > input")
inputTag.send_keys('python')
# To only parse data out of a page, hand page_source to lxml instead
# (lxml is written in C and parses faster);
# to type into elements or click buttons, selenium is required

Ways to operate a select tag


from selenium import webdriver
from selenium.webdriver.support.ui import Select

driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path = driver_path)
driver.get("http://www.dobai.cn/")

# wrap the select element (found here by name) in the Select class
selectBtn = Select(driver.find_element_by_name('jumpMenu'))
# by index
selectBtn.select_by_index(0)
# by the value attribute
selectBtn.select_by_value("http://m.95xiu.com/")
# by visible text
selectBtn.select_by_visible_text("95秀客户端")

Searching Baidu for python


from selenium import webdriver

driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path = driver_path)
driver.get("http://www.baidu.com/")

inputTag = driver.find_element_by_id('kw')
inputTag.send_keys('python')

submitTag = driver.find_element_by_id('su')
submitTag.click()

Selenium action chains

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
 
driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path = driver_path)
driver.get("http://www.baidu.com/")

inputTag = driver.find_element_by_id('kw')
submitBtn = driver.find_element_by_id('su')

actions = ActionChains(driver)
actions.move_to_element(inputTag)
actions.send_keys_to_element(inputTag, 'python')
actions.move_to_element(submitBtn)
actions.click(submitBtn)
actions.perform()  # the queued actions only run when perform() is called

Operating cookies with Selenium

from selenium import webdriver
 
driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path = driver_path)
driver.get("http://www.baidu.com/")

# get all cookies
for cookie in driver.get_cookies():
    print(cookie)

# get a cookie's value by its key
print(driver.get_cookie("PSTM"))

# delete all cookies
driver.delete_all_cookies()

# delete a single cookie
driver.delete_cookie("PSTM")

Selenium implicit and explicit waits

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
 
driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path = driver_path)
driver.get("http://www.douban.com/")

# implicit wait: poll for up to 10 seconds; if no element with id
# 'hfsjfjkghfj' appears by then, an exception is raised
driver.implicitly_wait(10)
driver.find_element_by_id("hfsjfjkghfj")

# explicit wait: the locator is passed as a tuple, (By.ID, 'form_email')
element = WebDriverWait(driver,10).until(EC.presence_of_element_located((By.ID,'form_email')))
print(element)

Opening and switching between windows in Selenium

from selenium import webdriver

driver_path=r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com/')
# a new window opens on douban, but driver still points at the baidu window
driver.execute_script("window.open('https://www.douban.com/')")
# switch driver to the douban window (index 1; handles are ordered by opening, baidu is 0)
driver.switch_to.window(driver.window_handles[1])
print(driver.current_url)  # check which window driver actually points at
print(driver.page_source)

Using a proxy with Selenium

from selenium import webdriver

driver_path = r'D:\tools\chromedriver.exe'
options = webdriver.ChromeOptions()
options.add_argument("--proxy-server=http://171.41.82.12:9999")

driver = webdriver.Chrome(executable_path =driver_path,chrome_options=options)
driver.get("http://httpbin.org/ip")

Lagou nationwide Python job crawler

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 11 15:34:47 2019

@author: admin
"""
from selenium import webdriver
from lxml import etree
import re
import csv
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

class LagouSpider(object):
    
    driver_path = r"D:\tools\chromedriver.exe"
    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.positions=[]
        fp = open('lagou.csv','a',newline='',encoding='utf-8')
        # the fieldnames must match the keys of the dict passed to writerow
        self.writer = csv.DictWriter(fp,['name','salary','city','work_years','desc'])
        self.writer.writeheader()
        
    def run(self):
        self.driver.get(self.url)
        while True:
            # wait for the pager first, so page_source is complete when grabbed
            WebDriverWait(driver= self.driver,timeout=50).until(EC.presence_of_all_elements_located((By.XPATH,"//div[@class='pager_container']/span[last()]")))
            source = self.driver.page_source
            self.parse_list_page(source)
            try:
                next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break
                else:
                    next_btn.click() 
            except Exception:
                print(source)
            time.sleep(2)
        
    def parse_list_page(self,source):
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(2)
            
    
    def request_detail_page(self,url):
        self.driver.execute_script("window.open('%s')" %url)
        self.driver.switch_to.window(self.driver.window_handles[1])        

        WebDriverWait(driver= self.driver,timeout=50).until(EC.presence_of_all_elements_located((By.XPATH,"//span[@class='name']")))
        source = self.driver.page_source
        self.parse_detail_page(source)
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])       

    def parse_detail_page(self,source):
        html = etree.HTML(source)
        position_name=html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath(".//text()")[0].strip()
        city = re.sub(r"[\s/]","",city)
        work_years = job_request_spans[2].xpath(".//text()")[0].strip()
        work_years = re.sub(r"[\s/]","",work_years)
        desc ="".join(html.xpath("//dd[@class='job_bt']//text()")).strip()    
        position={
                'name':position_name,
                'salary':salary,
                'city':city,
                'work_years':work_years,
                'desc':desc                
                
                }
        self.write_position(position)
      
       
    def write_position(self,position):
        self.writer.writerow(position)
        print(position)
               
    
if __name__=='__main__':
    spider = LagouSpider()
    spider.run()
    
  • Because of Lagou's anti-crawler mechanism, you have to log in before the crawler can reach the second page.

Boss Zhipin nationwide Python job crawler

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 24 09:56:14 2019

@author: admin
"""

from lxml import etree
from selenium import webdriver
import time
import re
import csv
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

class BossSpider(object):
    driver_path=r"D:\tools\chromedriver.exe"    
    def __init__(self):
        
        self.driver=webdriver.Chrome(executable_path=BossSpider.driver_path)
        # the page listing all jobs; taken from the Request URL value
        self.url="https://www.zhipin.com/job_detail/?query=python&city=100010000&industry=&position="
        # easy to miss: WebDriverException: unknown error: unhandled inspector error
        # usually means a scraped url lacks the leading domain; compare it with
        # the url shown in the page
        self.domain="https://www.zhipin.com"

        fp=open('boss.csv','a',newline='',encoding='utf-8')
        self.writer=csv.DictWriter(fp,['name','salary','company_name','location','work_experience','education','job_desc'])
        self.writer.writeheader()

    def run(self):
        self.driver.get(self.url)
        while True:
            # wait for the next-page button first, so page_source is complete
            WebDriverWait(driver= self.driver,timeout=50).until(EC.presence_of_all_elements_located((By.XPATH,"//a[contains(@class,'next')]")))
            # unlike right-click View Source, page_source reflects the rendered
            # page and contains all the job listings
            source=self.driver.page_source
            self.parse_list_page(source)
            next_btn=self.driver.find_element_by_xpath("//a[contains(@class,'next')]")
            # on the last page, the next button's class attribute contains 'disabled'
            if "disabled" in next_btn.get_attribute("class"):
                break
            else:                
                next_btn.click()
            time.sleep(1)
        
        
    def parse_list_page(self,source):
        '''
        Collect job detail links from a listing page.
        '''
        html=etree.HTML(source)
        # detail-page urls: 30 jobs per listing page
        links=html.xpath("//div[@class='info-primary']//a[position()=1]/@href")
        for link in links:
            url=self.domain+link
            self.request_detail_page(url)
            time.sleep(1)
        
    def request_detail_page(self,url):
        self.driver.execute_script("window.open('%s')" %url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        
        source = self.driver.page_source
        self.parse_detail_page(source)
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
        
    def parse_detail_page(self,source):
        html=etree.HTML(source)
        position_name=html.xpath("//div[@class='info-primary']/div[@class='name']/h1/text()")[0]
        salary=html.xpath("//div[@class='name']/span[@class='salary']/text()")[0]
        salary=re.sub(r"[\s/]","",salary)
        company_name=html.xpath("//a[@ka='job-detail-company']/text()")[1].strip()
        location=html.xpath("//div[@class='info-primary']/p[position()=1]/text()")[0]
        work_experience=html.xpath("//div[@class='info-primary']/p[position()=1]/text()")[1]
        education=html.xpath("//div[@class='info-primary']/p[position()=1]/text()")[2]
        job_desc="\n".join(html.xpath("//div[@class='detail-content']/div[@class='job-sec'][position()=1]/div[@class='text']/text()")).strip()
        position={
                'name':position_name,
                'salary':salary,
                'company_name':company_name,
                'location':location,
                'work_experience':work_experience,
                'education':education,
                'job_desc':job_desc  
                
                }
        self.write_position(position)      
       
    def write_position(self,position):
        self.writer.writerow(position)
        print(position)        
    
if __name__=='__main__':
    spider=BossSpider()
    spider.run()
  • Writing the job data to the csv is slow (rows may only appear after the program stops), and the csv shows garbled Chinese (typically when opened in Excel); see the sketch below for a likely fix.
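
A minimal sketch of two likely fixes (assuming the csv is opened in Excel, the usual cause of the garbled Chinese): flush after every row, and write a UTF-8 BOM so Excel detects the encoding.

import csv

# 'utf-8-sig' prepends a BOM, which lets Excel recognize the file as UTF-8
fp = open('boss.csv', 'a', newline='', encoding='utf-8-sig')
writer = csv.DictWriter(fp, ['name', 'salary', 'company_name', 'location',
                             'work_experience', 'education', 'job_desc'])
writer.writeheader()

def write_position(position):
    writer.writerow(position)
    fp.flush()  # push the row to disk immediately instead of waiting for exit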
