[Python Crawler] 5 - Advanced Crawling (Multithreaded Crawlers / Dynamic Page Scraping / Image CAPTCHA Recognition)

# I. Multithreaded crawlers
# 1) Concept: multithreading lets several tasks make progress alongside each other, raising the system's efficiency by making better use of its resources
#   Analogy: a thread is a carriage of the train; a process is the locomotive
#
# 2) The threading module: the module dedicated to multithreaded programming
# Tips: two handy things to print while debugging (a short sketch using them follows the example below):
    # threading.enumerate(): list the currently alive threads (and thus their number)
    # threading.current_thread(): information about the current thread
import threading
import time
def coding():
    for i in range(3):
        print("Coding",i)
        time.sleep(1)
def drawing():
    for i in range(3):
        print("Drawing",i)
        time.sleep(1)
def multi_thread():
    t1 = threading.Thread(target=coding)
    t1.start()
    t2 = threading.Thread(target=drawing)
    t2.start()
if __name__ == '__main__':
    multi_thread()
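
# A quick illustrative sketch of the two debugging helpers mentioned above
# (worker/inspect_threads are just demo names, not part of the original example)
import threading
import time
def worker():
    print("running in:", threading.current_thread().name)
    time.sleep(1)
def inspect_threads():
    for _ in range(2):
        threading.Thread(target=worker).start()
    print("alive threads:", len(threading.enumerate()))  # the worker threads plus the main thread
if __name__ == '__main__':
    inspect_threads()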

# 3) Subclassing threading.Thread
# Purpose: better encapsulation of the thread's code
import threading
import time
class CodingThread(threading.Thread):
    def run(self):
        for i in range(3):
            print("Coding",i,threading.current_thread())
            time.sleep(1)
class DrawingThread(threading.Thread):
    def run(self):
        for i in range(3):
            print("Drawing",i,threading.current_thread())
            time.sleep(1)
def multi_thread():
    t1 = CodingThread()
    t1.start()
    t2 = DrawingThread()
    t2.start()
if __name__ == '__main__':
    multi_thread()

# 4) Using threading.Lock()
# When several threads share a global variable (this matters for read-modify-write operations, not for read-only access), acquire a Lock before each thread operates on the variable, so that interleaved threads cannot corrupt the result (a contrasting no-lock sketch follows the example below)
import threading
Tlock = threading.Lock()
VALUE = 0
def add_value():
    global VALUE # declare that VALUE refers to the global variable
    Tlock.acquire()
    for i in range(1000000):
        VALUE += 1
    Tlock.release()
    print("VALUE:",VALUE)
def multi_thread():
    for j in range(2): # number of threads to start
        t = threading.Thread(target=add_value)
        t.start()
if __name__ == '__main__':
    multi_thread()
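
# A contrasting illustrative sketch: the same counter incremented WITHOUT the lock
# (VALUE2/add_value_unlocked are demo names). The two threads may interleave the
# read-modify-write of VALUE2, so the final total can come out below 2000000.
import threading
VALUE2 = 0
def add_value_unlocked():
    global VALUE2
    for i in range(1000000):
        VALUE2 += 1
def race_demo():
    threads = [threading.Thread(target=add_value_unlocked) for _ in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print("VALUE2:", VALUE2)  # often less than 2000000 when the race occurs
if __name__ == '__main__':
    race_demo()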

# 5) Producer/consumer pattern, Lock version
import threading
import random
import time
TLock = threading.Lock()
TTimes = 0
MONEY = 1000
class Producer(threading.Thread):
    def run(self):
        global MONEY # declare the global variable MONEY
        global TTimes
        while True:
            earn_money = random.randint(100,1000)
            TLock.acquire()
            if TTimes >= 10:
                TLock.release()
                break
            else:
                MONEY += earn_money
                print("{}赚了{}元,剩余{}元".format(threading.current_thread(),earn_money,MONEY))
                TLock.release()
                TTimes += 1
                time.sleep(0.5)
class Consumer(threading.Thread):
    def run(self):
        global MONEY  # declare the global variable MONEY
        while True:
            spend_money = random.randint(100,1500)
            TLock.acquire()
            if MONEY > spend_money:
                MONEY -= spend_money
                print("{}当前取出{}元钱,剩余{}元钱".format(threading.current_thread(),spend_money,MONEY))
            else:
                # If the money is not enough, the producers may already have hit the limit, so check the counter here
                if TTimes >= 10:
                    TLock.release()
                    break
                else:
                    print("{}当前想取{}元钱,剩余{}元钱不足!".format(threading.current_thread(),spend_money,MONEY))
            TLock.release()
            time.sleep(0.5)
def multi_thread():
    for p in range(3):
        Producer(name="Producer thread {}".format(p)).start() # give the thread a name
    for c in range(5):
        Consumer(name="Consumer thread {}".format(c)).start()
if __name__ == '__main__':
    multi_thread()

# 6) Producer/consumer pattern, Condition version
# threading.Condition builds on a Lock. Compared with the Lock version it wastes fewer resources (the Lock version does a lot of pointless locking and unlocking while waiting). Besides acquire and release, the commonly used methods are:
    # 1. wait: put the current thread into a waiting state and release the lock; it can be woken by another thread calling notify or notify_all, after which it queues up to re-acquire the lock and, once it holds it again, continues with the code after wait()
    # 2. notify: wake one waiting thread (by default the first one that started waiting)
    # 3. notify_all: wake all waiting threads
# Note: notify and notify_all do not release the lock themselves; they only tell the waiting threads to queue up for it, and !!!they must be called before release!!!
# Rough idea of the rewrite: when MONEY is insufficient the consumer calls wait(); after a producer adds money it calls notify_all() to wake the waiting consumers
import threading
import random
import time
TCondition = threading.Condition()
TTimes = 0
MONEY = 1000
class Producer(threading.Thread):
    def run(self):
        global MONEY  # declare the global variable MONEY
        global TTimes
        while True:
            earn_money = random.randint(100,1000)
            TCondition.acquire()
            if TTimes >= 10:
                TCondition.release()
                break
            else:
                MONEY += earn_money
                print("{} earned {} yuan, balance: {} yuan".format(threading.current_thread(),earn_money,MONEY))
                TCondition.notify_all() # must come before release
                TTimes += 1  # update the shared counter while still holding the lock
                TCondition.release()
                time.sleep(0.5)
class Consumer(threading.Thread):
    def run(self):
        global MONEY  # declare the global variable MONEY
        while True:
            spend_money = random.randint(100,1500)
            TCondition.acquire()
            # A while loop is needed here: by the time this thread gets the lock again, the condition may no longer hold
            while MONEY < spend_money:
                if TTimes >= 10:
                    TCondition.release()
                    return
                else:
                    print("{}当前想取{}元钱,剩余{}元钱,不足!".format(threading.current_thread(),spend_money,MONEY))
                    TCondition.wait()
            MONEY -= spend_money
            print("{}消费了{}元,剩余{}元".format(threading.current_thread(),spend_money,MONEY))
            TCondition.release()
            time.sleep(0.5)
def multi_thread():
    for p in range(5):
        Producer(name="Producer thread {}".format(p)).start() # give the thread a name
    for c in range(5):
        Consumer(name="Consumer thread {}".format(c)).start()
if __name__ == '__main__':
    multi_thread()

# 7) Queue, the thread-safe queue:
# 1. Concept: accessing shared globals from threads usually needs a lock; Python's built-in queue module provides synchronized, thread-safe queues, mainly first-in-first-out (FIFO) and last-in-first-out (LIFO)
# 2. Common functions, shown here for FIFO; for LIFO use the import below instead (a short thread example follows these calls)
# from queue import LifoQueue
from queue import Queue
q = Queue(4) # create a queue and set its maximum size
q.put(1) # add an element to the queue
q.qsize() # current size of the queue
q.empty() # is the queue empty?
q.full() # is the queue full?
q.get() # take an element out of the queue
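
# A small illustrative sketch of a Queue shared between two threads
# (queue_worker/queue_demo are demo names): the main thread put()s items, a daemon
# worker get()s them, and q.join() blocks until every item is marked done with task_done().
from queue import Queue
import threading
def queue_worker(q):
    while True:
        item = q.get()
        print("handled:", item)
        q.task_done()  # tell the queue this item is finished
def queue_demo():
    q = Queue(10)
    threading.Thread(target=queue_worker, args=(q,), daemon=True).start()
    for i in range(5):
        q.put(i)
    q.join()  # wait until every queued item has been processed
if __name__ == '__main__':
    queue_demo()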

# 8) Multithreading in practice, part 1
import requests # request the url
from lxml import etree # parse the page with xpath syntax
from urllib import request # request.urlretrieve saves a file locally
import re # re.sub replaces the special characters in file names
import os # os.path.splitext splits the name at the dot to get the file extension
from queue import Queue
import threading

class Producer(threading.Thread):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
    def __init__(self,page_queue,img_queue,*args,**kwargs): 
        super(Producer,self).__init__(*args,**kwargs) # pass *args and **kwargs through to the parent class threading.Thread (*args stands for any positional arguments, **kwargs for any keyword arguments; together they cover any arguments)
        self.page_queue = page_queue
        self.img_queue = img_queue
    def run(self):
        while True:
            if self.page_queue.empty():
                break
            else:
                page_url = self.page_queue.get()
                self.parse_page(page_url)
    def parse_page(self,page_url):
        response = requests.get(url=page_url,headers=self.headers)
        text = response.text
        html = etree.HTML(text=text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            img_url = img.get("data-original")
            img_alt = img.get("alt")
            suffix = os.path.splitext(img_url)[1]
            suffix = re.sub("dta", "", suffix)
            img_file_name = img_alt + suffix
            img_file_name = re.sub("[,?!,。?!']","",img_file_name)
            self.img_queue.put((img_url,img_file_name)) # put them into the queue as a tuple

class Consumer(threading.Thread):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
    def __init__(self,page_queue,img_queue,*args,**kwargs):
        super(Consumer,self).__init__(*args,**kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue
    def run(self):
        while True:
            if self.img_queue.empty() and self.page_queue.empty():
                return
            else:
                img = self.img_queue.get()
                img_url,img_file_name = img # a tuple can be unpacked into its elements like this
                request.urlretrieve(img_url,"images/"+img_file_name)
                print(img_file_name,"downloaded")

def spider():
    page_queue = Queue(10)
    img_queue = Queue(500)
    for i in range(1, 11):
        page_url = "https://www.doutula.com/photo/list/?page={}".format(i)
        page_queue.put(page_url)
    for a in range(5):
        p = Producer(page_queue,img_queue)
        p.start()
    for b in range(5):
        c = Consumer(page_queue,img_queue)
        c.start()

if __name__ == '__main__':
    spider()

# 9) The Global Interpreter Lock (GIL)
# 1) Concept: Python runs on the CPython interpreter by default (so the "multithreading" above is really an illusion; in practice a single CPU core takes turns running the threads)
# To guarantee that only one thread runs at a time, CPython has the GIL (Global Interpreter Lock). The lock is necessary because CPython's memory management is not thread-safe. Besides CPython there are other interpreters, some of them without a GIL: Jython (no GIL), IronPython (no GIL), PyPy (has a GIL)
# 2) Note: even though this is "fake" multithreading, it still speeds up IO-bound work (Input/Output: file reads/writes, network requests, etc.) a great deal. For CPU-heavy computation (rendering models and the like) multithreading is not recommended; use multiprocessing instead, which can use multiple cores (see the sketch below)
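
# A rough illustrative timing sketch of the point above (count_down/timed are demo names):
# for a pure-CPU loop, two threads give little or no speed-up under the GIL, while two
# processes can use two cores. The exact numbers depend on the machine.
import threading
import multiprocessing
import time
def count_down(n=10000000):
    while n > 0:
        n -= 1
def timed(label, workers):
    start = time.time()
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(label, round(time.time() - start, 2), "seconds")
if __name__ == '__main__':
    timed("2 threads:", [threading.Thread(target=count_down) for _ in range(2)])
    timed("2 processes:", [multiprocessing.Process(target=count_down) for _ in range(2)])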

# 10) Multithreading in practice, part 2
import csv
import requests
import threading
from queue import Queue
from lxml import etree

class Producer(threading.Thread):
    def __init__(self,page_queue,joke_queue,*args,**kwargs):
        super(Producer,self).__init__(*args,**kwargs)
        self.page_queue = page_queue
        self.joke_queue = joke_queue
        self.head_url = "http://www.budejie.com"
    def run(self):
        while True:
            if self.page_queue.empty():
                break
            else:
                headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}
                page_url = self.page_queue.get()
                response = requests.get(url=page_url, headers=headers)
                text = response.text
                html = etree.HTML(text=text)
                contents = html.xpath("//div[@class='j-r-list-c-desc']")
                for content in contents:
                    joke_raw = content.xpath(".//text()")
                    joke = "\n".join(joke_raw).strip()
                    tail_url = content.xpath(".//a/@href")[0]  # once again, xpath returns a list
                    joke_url = self.head_url + tail_url
                    self.joke_queue.put((joke,joke_url))

class Consumer(threading.Thread):
    def __init__(self,joke_queue,writer,tlock,*args,**kwargs):
        super(Consumer,self).__init__(*args,**kwargs)
        self.joke_queue = joke_queue
        self.writer = writer
        self.lock = tlock
    def run(self):
        while True:
            try:
                joke_info = self.joke_queue.get(timeout=40)
                joke,joke_url = joke_info
                self.lock.acquire()
                self.writer.writerow((joke,joke_url))
                self.lock.release()
                print("正在保存ing……")
            except:
                break

def spider():
    page_queue = Queue(10)
    joke_queue = Queue(100)
    tlock = threading.Lock()
    file = open("bsbdj.csv","a",newline="",encoding="utf-8")
    writer = csv.writer(file)
    writer.writerow(("joke","joke_url"))
    for i in range(1,11):
        page_url = "http://www.budejie.com/text/{}".format(i)
        page_queue.put(page_url)
    for p in range(5):
        Producer(page_queue,joke_queue).start()
    for c in range(5):
        Consumer(joke_queue,writer,tlock).start()

if __name__ == '__main__':
    spider()
# II. Scraping dynamic web pages
# 1) What AJAX is
# AJAX (Asynchronous JavaScript And XML): a traditional page has to reload completely to update its content, while AJAX exchanges a small amount of data with the server in the background so the page can update asynchronously (parts of the page can be refreshed without reloading the whole page; "load more" buttons are a typical example)
# !!!Note!!!: content loaded via AJAX cannot be seen with "view page source", but it does show up in the Elements panel
 
# 2) Ways to get AJAX-loaded data (a small sketch of approach 1 follows this list)
# 1. Analyse the interface: work out which endpoint the ajax call hits, then request that endpoint directly from code (pros: little code, high performance; cons: analysing the interface can be fairly involved)
# 2. selenium: use selenium + chromedriver to imitate the browser's behaviour and pick up the data (pros: genuinely mimics the browser; cons: more code, lower performance)
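
# A minimal illustrative sketch of approach 1: once the Network panel reveals the JSON
# endpoint behind a "load more" button, request it directly and read the parsed payload.
# httpbin.org/json is only a stand-in endpoint that returns JSON; Appendix 1 at the end
# of these notes does the same thing for the real Lagou interface.
import requests
resp = requests.get("http://httpbin.org/json", headers={"User-Agent": "Mozilla/5.0"})
data = resp.json()  # the JSON body parsed into a Python dict
print(data)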
 
# 3) selenium + chromedriver
# 1. Concept and installation
# selenium works like a robot that imitates how a human operates the browser (install: pip install selenium)
# chromedriver is the driver program that controls the Chrome browser (install: just put it in a permissions-free, English-only directory)
# 2. Coding
from selenium import webdriver
driver_path = r"C:\Users\Administrator\.anaconda\chromedriver\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("https://www.baidu.com/")
# 2-1. Get the page source
get_text = driver.page_source
# !!!Note!!!: if you only need to extract data from the page, grab the source and feed it to the C-based lxml for better performance; if you need to interact with elements you have to use selenium's own element-locating methods -> see 2-3
# 2-2. Close pages
driver.close() # close the current page
driver.quit() # quit the whole browser
# 2-3. Locating elements
# !!!Note!!!: find_element returns the first element that matches; find_elements returns all matches as a list
# !!!Note!!!: this is slower than lxml, but the advantage is that you can keep interacting with the element, e.g.:
# inputTag = driver.find_element_by_class_name("s_ipt")
# inputTag.send_keys("hello world")
# 2-3-1. Find an element by id
find_element_by_id = driver.find_element_by_id("head")
# 2-3-2. Find an element by class name
find_element_by_class = driver.find_element_by_class_name("head_wrapper")
# 2-3-3. Find an element by its name attribute
find_element_by_name = driver.find_element_by_name("tj_trxueshu")
# 2-3-4. Find an element by tag name
find_element_by_tag = driver.find_element_by_tag_name("div")
# 2-3-5. Find an element with xpath syntax
find_element_by_xpath = driver.find_element_by_xpath("//div")
# 2-3-6. Find an element with a CSS selector
find_element_by_css = driver.find_element_by_css_selector("div")
# 2-4. Working with form elements
# 2-4-1. Input boxes (search boxes etc.): locate the element with find_element -> fill it with send_keys(value)
inputTag = driver.find_element_by_id("kw")
inputTag.send_keys("python")
inputTag.clear() # clears what was typed
# 2-4-2. Checkboxes: locate the element with find_element -> tick it with click
checkTag = driver.find_element_by_name("checkbox")
checkTag.click() # click again to untick
# 2-4-3. Select (drop-down menus): after clicking you still have to choose an option, so use selenium.webdriver.support.ui.Select: pass the located element into this class and make the selection on the resulting object
from selenium.webdriver.support.ui import Select
selectTag = Select(driver.find_element_by_name("jumpMenu")) # locate the tag and wrap it in a Select object
selectTag.select_by_index(1) # select by index
selectTag.select_by_value("http://www.selecttest.com/") # select by value
selectTag.select_by_visible_text("选择这个") # select by visible text
# 2-4-4. Submit (buttons): locate the element with find_element -> press it with click
submitTag = driver.find_element_by_id("su")
submitTag.click()
# 2-5. Action chains: use ActionChains
from selenium.webdriver.common.action_chains import ActionChains
actions = ActionChains(driver)
inputTag = driver.find_element_by_id("kw") # find the input tag
actions.move_to_element(inputTag) # move the cursor onto the input tag
actions.send_keys_to_element(inputTag,"python") # type "python" into it
submitTag = driver.find_element_by_id("su") # find the submit tag
actions.move_to_element(submitTag) # move the cursor onto the submit tag
actions.click(submitTag) # click the submit tag
actions.perform()
# 2-6. Cookie operations (they all act on the page the driver has get()-ed)
# 2-6-1. Get all cookies:
driver.get_cookies()
# 2-6-2. Get a cookie's value by its name:
driver.get_cookie("cookie_name")
# 2-6-3. Delete all cookies:
driver.delete_all_cookies()
# Delete a single cookie:
driver.delete_cookie("cookie_name")
# 2-7. Page waits: more and more pages use AJAX, and touching content before it has loaded raises errors, so we need to wait
# 2-7-1. Implicit wait (one timeout, in seconds, for the whole driver)
driver.implicitly_wait(10)
# 2-7-2. Explicit wait (WebDriverWait together with expected_conditions)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# wait on the page for up to ten seconds, or until the element with ID "testid" appears
WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.ID,"testid")))
# 2-8. Switching between pages
driver.execute_script("window.open('https://www.douban.com/')") # open a new page
# !!!Note!!! even though a new window has been opened, driver still points at the previous page; you must switch to the target window via driver.window_handles
driver.switch_to.window(driver.window_handles[1]) # pick the window to jump to out of window_handles
# 2-9. Setting a proxy IP (or switch a proxy client such as shadowsocks to global mode)
# https://www.kuaidaili.com/
from selenium import webdriver
driver_path = r"C:\Users\Administrator\.anaconda\chromedriver\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument("--proxy-server=http://121.232.148.138:9000")
driver = webdriver.Chrome(executable_path=driver_path,chrome_options=options)
driver.get("http://httpbin.org/ip") # 测试ip
# 2-10. WebElement objects and reading attributes
# Most commonly used: get_attribute("some attribute of the tag"); see the WebElement class for the full list
# The element objects returned by driver's find_element_* methods are WebElement instances
from selenium.webdriver.remote.webelement import WebElement
submitTag = driver.find_element_by_id("su")
submitTag.get_attribute("value")

# 3) selenium + chromedriver in practice, part 1: a Lagou crawler
import re
import time
from lxml import etree
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

class LagouSpider(object):
    driver_path = r"C:\Users\admin\.anaconda\chromedriver\chromedriver.exe"
    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
    # The "brain" of the spider
    # 1. Get each job-list page and pass its content to parse_list for parsing
    def run(self):
        url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.driver.get(url)
        while True:
            WebDriverWait(driver=self.driver, timeout=10).until(EC.presence_of_element_located((By.XPATH, "//span[contains(@class,'pager_next')]")))  # make sure the content has finished loading
            resource = self.driver.page_source  # the rendered page content, i.e. what F12 shows, not the same as "view page source"
            self.parse_list(resource)
            next_btn = self.driver.find_element_by_xpath("//span[contains(@class,'pager_next')]")
            if "pager_next_disabled" in next_btn.get_attribute('class'):
                break
            else:
                next_btn.click()
                time.sleep(1)
    # 2. Parse each list page with lxml, extract the individual position_urls and pass each one to parse_position_url
    def parse_list(self, resource):
        html = etree.HTML(resource)
        position_urls = html.xpath("//a[@class='position_link']/@href")
        for position_url in position_urls:
            self.parse_position_url(position_url)
            time.sleep(1)
    # 3. Parse each position_url with lxml
    def parse_position_url(self, position_url):
        self.driver.execute_script("window.open('" + position_url + "')")  # 需要新建一个窗口,并且转换到改窗口上
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(EC.presence_of_element_located((By.XPATH, "//dd[@class='job_bt']")))
        resource = self.driver.page_source
        html = etree.HTML(resource)
        position_company = html.xpath("//div[@class='job-name']//div[@class='company']/text()")[0]
        position_name = html.xpath("//div[@class='job-name']//span[@class='name']/text()")[0]
        position_salary = html.xpath("//dd[@class='job_request']//span[@class='salary']/text()")[0]
        position_city_raw = html.xpath("//dd[@class='job_request']//span/text()")[1]
        position_city = re.sub(r"[/\s]", "", position_city_raw)
        position_experience_raw = html.xpath("//dd[@class='job_request']//span/text()")[2]
        position_experience = re.sub(r"[/\s]", "", position_experience_raw)
        position_academic_requirements_raw = html.xpath("//dd[@class='job_request']//span/text()")[3]
        position_academic_requirements = re.sub(r"[/\s]", "", position_academic_requirements_raw)
        position_nature = html.xpath("//dd[@class='job_request']//span/text()")[4]
        position_advantage = html.xpath("//dl[@class='job_detail']//dd[@class='job-advantage']/p/text()")[0]
        position_description_raw = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        position_description = re.sub("\s", "", position_description_raw)
        position = {"position_company:": position_company, "position_name:": position_name,
                    "position_salary": position_salary,
                    "position_city": position_city, "position_experience": position_experience,
                    "position_academic_requirements:": position_academic_requirements,
                    "position_nature": position_nature, "position_advantage": position_advantage,
                    "position_description": position_description}
        self.driver.close()  # close the newly opened window and switch back to the original one
        self.driver.switch_to.window(self.driver.window_handles[0])
        print(position)
if __name__ == '__main__':
    LagouSpider().run()

# 4) selenium + chromedriver in practice, part 2: a BOSS Zhipin crawler
import re
import time
import csv
from selenium import webdriver
from lxml import etree
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

class BossSpider(object):
    driver_path = r"C:\Users\Administrator\.anaconda\chromedriver\chromedriver.exe"
    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=self.driver_path)
    def run(self):
        main_url = "https://www.zhipin.com/job_detail/?query=web%E5%89%8D%E7%AB%AF%E5%BC%80%E5%8F%91&scity=100010000&industry=&position="
        self.driver.get(main_url) # note: get must stay outside the while loop, otherwise the crawler keeps bouncing back to page 1 (the 1-2-1 loop problem)
        print("parsing main_url")
        while True:
            text = self.driver.page_source
            self.parse_main_url(text) # parse main_url
            next_page_btn = self.driver.find_element_by_xpath("//div[@class='page']//a[@class='next']")
            if "disabled" in next_page_btn.get_attribute("class"):
                break
            else:
                next_page_btn.click()
    def parse_main_url(self,text):
        html = etree.HTML(text=text)
        head_position_url = "https://www.zhipin.com"
        tail_position_urls = html.xpath("//div[@class='info-primary']//a[position()=1]/@href") # note 2: restrict to the first <a>; without the position filter the company-homepage urls get mixed in, so always inspect the urls you extract
        for tail_position_url in tail_position_urls:
            position_url = head_position_url + tail_position_url
            self.parse_position_url(position_url) # 解析position_url
    def parse_position_url(self,position_url):
        self.driver.execute_script("window.open('"+position_url+"')") # 打开一个新页面 position_url,要对url加上引号
        self.driver.switch_to_window(self.driver.window_handles[1])
        self.driver.get(position_url)# 解析
        text = self.driver.page_source
        html = etree.HTML(text=text)
        publish_time = html.xpath("//div[@class='info-primary']//span[@class='time']/text()")[0]
        name = html.xpath("//div[@class='info-primary']//div[@class='name']/h1/text()")[0]
        salary_raw = html.xpath("//div[@class='info-primary']//span[@class='badge']/text()")[0]
        salary = re.sub(r"[\s\n]", "", salary_raw)
        city = html.xpath("//div[@class='info-primary']//p//text()")[0]
        experience = html.xpath("//div[@class='info-primary']//p//text()")[1]
        education = html.xpath("//div[@class='info-primary']//p//text()")[2]
        company_name = html.xpath("//a[@ka='job-detail-company']/text()")[0]
        job_desc_raw = html.xpath("//div[@class='detail-content']/div[@class='job-sec']//div[@class='text']//text()")
        job_desc = "\n".join(job_desc_raw).strip()
        company_desc_raw = html.xpath("//div[@class='detail-content']//div[@class='job-sec company-info']//div[@class='text']//text()")
        company_desc = "\n".join(company_desc_raw).strip()
        position = {"publish_time": publish_time,"company_name": company_name,"name": name, "salary": salary,
                    "city": city, "experience": experience, "education": education, "job_desc": job_desc,"company_desc":company_desc}
        print(position)
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
if __name__ == '__main__':
    BossSpider().run()
# III. Image CAPTCHA recognition: Tesseract
# 1) Overview: turning an image into text is generally called optical character recognition (OCR); Tesseract is an excellent open-source OCR library that can be used from Python

# 2) Setup: install Tesseract -> put the path of the trained-data files into the environment variables as well (create a new variable: TESSDATA_PREFIX) -> use it

# 3) Using Tesseract from the command line:
# 1. Steps: add the directory containing tesseract.exe to the PATH environment variable -> cd into the image's directory in cmd -> run: tesseract <image path> <output file name>
# e.g.: D:\Tesseract-test>tesseract python.png python
# 2. English is recognised by default; to recognise other kinds of text, first download the corresponding trained-data file from GitHub and put it into the tessdata folder
# download: https://github.com/tesseract-ocr/tessdata
# e.g.: D:\Tesseract-test>tesseract test.png test -l chi_sim
# !!!Note!!!: tesseract -h shows the help text

# 4) Using Tesseract from code:
import pytesseract
from PIL import Image # used to read the image
pytesseract.pytesseract.tesseract_cmd = r"D:\Tesseract-OCR\tesseract.exe" # point to tesseract.exe
image = Image.open(r"D:\Tesseract-test\python.png") # read the image
text = pytesseract.image_to_string(image=image) # the lang parameter sets the language; as above, the matching trained-data file must already be in the tessdata folder (see the sketch below)
print(text)
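
# A minimal illustrative sketch of the lang parameter mentioned above; the image path is
# hypothetical and chi_sim.traineddata must already be in the tessdata folder.
image_cn = Image.open(r"D:\Tesseract-test\test.png")
text_cn = pytesseract.image_to_string(image=image_cn, lang="chi_sim")
print(text_cn)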

# 5) Tesseract in practice: recognising Lagou's CAPTCHA:
import time
import pytesseract
from PIL import Image
from urllib import request
def identifier():
    num = 0
    while True:
        pytesseract.pytesseract.tesseract_cmd = r"D:\Tesseract-OCR\tesseract.exe"
        url = "https://passport.lagou.com/vcode/create?from=register&refresh=1513081451891"
        request.urlretrieve(url=url, filename=r"D:\Tesseract-test\v_code.png")
        image = Image.open(r"D:\Tesseract-test\v_code.png")
        text = pytesseract.image_to_string(image=image)
        print(text)
        print("=" * 20)
        num += 1
        time.sleep(3)
        if num >=10:
            break
if __name__ == '__main__':
    identifier()
# Appendix 1: the same Lagou crawler done the traditional way (without selenium)
import requests
import re
from lxml import etree
# 1. Before crawling, first check whether the content you need is present in the page source; if it is, the usual approach works, if not, the page probably uses AJAX
# For pages built with AJAX the real url is usually not the address-bar url; look for the actual url among the Ajax .json requests in the Network panel
def get_position_url():
    request_url = "https://www.lagou.com/jobs/positionAjax.json?city=%E6%9D%AD%E5%B7%9E&needAddtionalResult=false"
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
               "Referer":"https://www.lagou.com/jobs/list_python?city=%E6%9D%AD%E5%B7%9E&cl=false&fromSearch=true&labelWords=&suginput="}
    for i in range(1,11):
        data = {"first": "false","pn": i,"kd": "python"}
        response = requests.post(url=request_url,headers=headers,data=data)
        # 2. Parse the response as json. A simple way to confirm it really is json: take the response of the request found in step 1 and validate it at https://www.json.cn/; then locate the fields you need in the validated structure
        positions_json = response.json()
        positions = positions_json["content"]["positionResult"]["result"]
        for position in positions:
            positionId = position["positionId"] # the id sits three levels down: content -> positionResult -> result
            position_url = "https://www.lagou.com/jobs/{}.html".format(positionId) # build the corresponding position_url
            parse_position_url(position_url)

def parse_position_url(position_url):
    positions = []
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
               "Referer":"https://www.lagou.com/jobs/list_python?city=%E6%9D%AD%E5%B7%9E&cl=false&fromSearch=true&labelWords=&suginput="}
    response = requests.get(url=position_url,headers=headers)
    text = response.text
    html = etree.HTML(text=text)
    position_company = html.xpath("//div[@class='job-name']//div[@class='company']/text()")[0]
    position_name = html.xpath("//div[@class='job-name']//span[@class='name']/text()")[0]
    position_salary = html.xpath("//dd[@class='job_request']//span[@class='salary']/text()")[0]
    position_city_raw = html.xpath("//dd[@class='job_request']//span/text()")[1]
    position_city = re.sub(r"[/\s]","",position_city_raw)
    position_experience_raw = html.xpath("//dd[@class='job_request']//span/text()")[2]
    position_experience = re.sub(r"[/\s]","",position_experience_raw)
    position_academic_requirements_raw = html.xpath("//dd[@class='job_request']//span/text()")[3]
    position_academic_requirements = re.sub(r"[/\s]","",position_academic_requirements_raw)
    position_nature = html.xpath("//dd[@class='job_request']//span/text()")[4]
    position_advantage = html.xpath("//dl[@class='job_detail']//dd[@class='job-advantage']/p/text()")[0]
    position_description_raw = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
    position_description = re.sub("\s","",position_description_raw)
    position = {"position_company:": position_company, "position_name:": position_name, "position_salary": position_salary,
                "position_city": position_city, "position_experience": position_experience, "position_academic_requirements:": position_academic_requirements,
                "position_nature":position_nature,"position_advantage":position_advantage,"position_description":position_description}
    positions.append(position)
    print(positions)

if __name__ == '__main__':
    get_position_url()

 
