没有多线程
import threading
import time


def coding(times=3, delay=1):
    """Print a 'writing code' message `times` times, sleeping `delay`s each."""
    for x in range(times):
        print('正在写代码%s' % x)
        time.sleep(delay)


def drawing(times=3, delay=1):
    """Print a 'drawing' message `times` times, sleeping `delay`s each."""
    for x in range(times):
        print('正在画图%s' % x)
        time.sleep(delay)


def main():
    # Sequential version: drawing() only starts after coding() has finished.
    coding()
    drawing()


if __name__ == '__main__':
    main()
多线程
import threading
import time


def coding(times=3, delay=1):
    """Print a 'writing code' message `times` times, sleeping `delay`s each."""
    for x in range(times):
        print('正在写代码%s' % x)
        time.sleep(delay)


def drawing(times=3, delay=1):
    """Print a 'drawing' message `times` times, sleeping `delay`s each."""
    # BUG FIX: the original `for` header was missing its trailing colon.
    for x in range(times):
        print('正在画图%s' % x)
        time.sleep(delay)


def main():
    # Threaded version: both loops run concurrently and interleave output.
    t1 = threading.Thread(target=coding)
    t2 = threading.Thread(target=drawing)
    t1.start()
    t2.start()


if __name__ == '__main__':
    main()
使用Thread类创建多线程
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 26 16:37:44 2019
@author: admin

Creating threads by subclassing threading.Thread and overriding run().
"""
import threading
import time


class CodingThread(threading.Thread):
    """Thread that prints a 'writing code' message three times."""

    def run(self):
        for x in range(3):
            # current_thread() identifies which thread produced the line.
            print('正在写代码%s' % threading.current_thread())
            time.sleep(1)


class DrawingThread(threading.Thread):
    """Thread that prints a 'drawing' message three times."""

    def run(self):
        for x in range(3):
            print('正在画图%s' % threading.current_thread())
            time.sleep(1)


def main():
    # start() spawns the thread and invokes run() in it.
    t1 = CodingThread()
    t2 = DrawingThread()
    t1.start()
    t2.start()


if __name__ == '__main__':
    main()
锁用在改变全局变量的地方,访问全局变量时不需要加锁
import threading

# Shared counter mutated by several threads.
VALUE = 0
# Guards every read-modify-write of VALUE.
gLock = threading.Lock()


def add_value():
    """Increment the global VALUE 100000 times while holding the lock."""
    # 'global' is required because this function rebinds VALUE.
    global VALUE
    # Context manager guarantees release even if the loop raises.
    with gLock:
        for x in range(100000):
            VALUE += 1
    print('value:%d' % VALUE)


def main():
    for x in range(2):
        t = threading.Thread(target=add_value)
        t.start()


if __name__ == '__main__':
    main()
Lock版生产者和消费者
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 27 10:36:15 2019
@author: admin

Lock-based producer/consumer demo: producers add random amounts to a shared
balance for a fixed number of rounds; consumers withdraw while funds (or
pending production) remain.
"""
import threading
import time
import random

gMoney = 1000            # shared balance
gLock = threading.Lock()
gTotalTimes = 10         # total production rounds allowed
gTimes = 0               # production rounds completed so far


class Producer(threading.Thread):
    """Adds a random 100-1000 to gMoney until the quota is reached."""

    def run(self):
        global gMoney
        global gTimes
        while True:
            money = random.randint(100, 1000)
            gLock.acquire()
            if gTimes >= gTotalTimes:
                # Quota reached: release before leaving the loop.
                gLock.release()
                break
            gMoney += money
            print('%s生产了%d元钱,剩余%d元钱' % (threading.current_thread(), money, gMoney))
            gTimes += 1
            gLock.release()
            time.sleep(0.5)


class Consumer(threading.Thread):
    """Withdraws a random 100-1000 from gMoney while funds may still appear."""

    def run(self):
        global gMoney
        while True:
            money = random.randint(100, 1000)
            gLock.acquire()
            if gMoney >= money:
                gMoney -= money
                print('%s消费者消费了%d元钱,剩余%d元钱' % (threading.current_thread(), money, gMoney))
            else:
                if gTimes >= gTotalTimes:
                    # Funds are short and production is over: stop consuming.
                    gLock.release()
                    break
                print('%s消费者准备消费%d元钱,剩余%d元钱,不足!' % (threading.current_thread(), money, gMoney))
            gLock.release()
            time.sleep(0.5)


def main():
    for x in range(3):
        t = Consumer(name="消费者线程%d" % x)
        t.start()
    for x in range(5):
        t = Producer(name="生产者线程%d" % x)
        t.start()


if __name__ == '__main__':
    main()
condition版生产者和消费者
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 27 10:36:15 2019
@author: admin

Condition-based producer/consumer: consumers wait() on the condition when
funds are insufficient; producers notify_all() after adding money.
"""
import threading
import time
import random

gMoney = 1000                      # shared balance
gCondition = threading.Condition()
gTotalTimes = 10                   # total production rounds allowed
gTimes = 0                         # production rounds completed so far


class Producer(threading.Thread):
    """Adds a random 100-1000 to gMoney until the quota is reached."""

    def run(self):
        global gMoney
        global gTimes
        while True:
            money = random.randint(100, 1000)
            gCondition.acquire()
            if gTimes >= gTotalTimes:
                gCondition.release()
                break
            gMoney += money
            print('%s生产了%d元钱,剩余%d元钱' % (threading.current_thread(), money, gMoney))
            gTimes += 1
            # Wake every consumer blocked in wait().
            gCondition.notify_all()
            gCondition.release()
            time.sleep(0.5)


class Consumer(threading.Thread):
    """Waits on the condition until the balance covers the purchase."""

    def run(self):
        global gMoney
        while True:
            money = random.randint(100, 1000)
            gCondition.acquire()
            # BUG FIX: the source line was garbled ("while gMoney=gTotalTimes:").
            # Wait while the balance cannot cover this purchase.
            while gMoney < money:
                if gTimes >= gTotalTimes:
                    # Production finished and funds are short: give up.
                    gCondition.release()
                    return
                print('%s消费者准备消费%d元钱,剩余%d元钱,不足!' % (threading.current_thread(), money, gMoney))
                gCondition.wait()
            gMoney -= money
            print('%s消费者消费了%d元钱,剩余%d元钱' % (threading.current_thread(), money, gMoney))
            gCondition.release()
            time.sleep(0.5)


def main():
    for x in range(3):
        t = Consumer(name="消费者线程%d" % x)
        t.start()
    for x in range(5):
        t = Producer(name="生产者线程%d" % x)
        t.start()


if __name__ == '__main__':
    main()
如果想把一些数据存储到某个队列中,那么python内置了一个线程安全的模块叫queue模块。python中queue模块中提供了同步的、线程安全的队列类,包括FIFO队列Queue,LIFO队列LifoQueue。这些队列都实现了锁原语(可以理解为原子操作,要么不做,要么都做完),能够在多线程中直接使用。可以使用队列来实现线程间的同步。
不使用线程爬取斗图啦图片
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 27 11:52:37 2019
@author: admin

Single-threaded scraper that downloads non-gif meme images from doutula.com.
"""
import requests
from lxml import etree
from urllib import request
import re
import os


def parse_page(url):
    """Download every non-gif image found on one listing page."""
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
    }
    # BUG FIX: headers must be a keyword argument; the second positional
    # parameter of requests.get() is `params`, so the original call sent
    # no headers at all.
    response = requests.get(url, headers=headers)
    text = response.text
    html = etree.HTML(text)
    imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
    for img in imgs:
        img_url = img.get('data-original')
        alt = img.get('alt')
        # Strip punctuation that is awkward or illegal in file names.
        alt = re.sub(r'[\??\.!!::,]', '', alt)
        suffix = os.path.splitext(img_url)[1].replace('!dta', '')
        filename = alt + suffix
        # NOTE: the 'images' directory must already exist.
        request.urlretrieve(img_url, 'images/' + filename)


def main():
    for x in range(1, 101):
        url = 'https://www.doutula.com/photo/list/?page=%d' % x
        parse_page(url)


if __name__ == '__main__':
    main()
python自带的解释器是cpython。cpython解释器的多线程实际是个假的多线程(在多核CPU中,只能用一核)。同一时刻只有一个线程在执行,为了保证同一时刻只有一个线程在执行,在cpython解释器中有GIL。其他解释器:
Jython(运行在Java平台)、IronPython(运行在.NET平台)、PyPy(带JIT编译的解释器)。
AJAX是异步JavaScript和XML。ajax可以实现网页异步更新,可以不重新加载整个网页,对网页的某部分进行更新(比如点击小饭桌网页中的加载更多,查看网页源代码看不到加载更多的内容,而在element中可以看到加载更多的内容)。传统网页(不使用ajax)必须重载整个网页才能更新内容。因为传统网页传输数据格式使用XML语法,所以叫ajax,现在数据交互基本上都是JSON但是一直沿用这种叫法。
1.直接分析ajax调用的接口。然后通过代码请求这个接口。
2. 使用Selenium+chromedriver模拟浏览器行为获取数据
方式 | 优点 | 缺点 |
---|---|---|
分析接口 | 直接可以请求到数据,不需要解析,代码量少,性能高 | 分析接口比较复杂,特别是通过js混淆的接口,且容易被发现是爬虫 |
selenium | 浏览器能请求到的,使用selenium也能请求到,爬虫更稳定 | 代码量多,性能低 |
不同浏览器对应的driver
https://sites.google.com/a/chromium.org/chromedriver/downloads
安装chromedriver:下载完成后,放在不需要权限的纯英文路径下即可
from selenium import webdriver
import time

driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("https://www.baidu.com/")
# BUG FIX: the source line was truncated at "driver.find"; locate the
# search box by its id, as the later examples in this file do.
inputTag = driver.find_element_by_id('kw')
from selenium import webdriver

driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("https://www.baidu.com/")
# Alternative locator strategies for the same element:
#inputTag=driver.find_element_by_id('kw')
#inputTag=driver.find_element_by_name('wd')
# BUG FIX: find_element_by_css_selector returns one WebElement, which is
# not subscriptable; the plural find_elements_... returns a list.
inputTag = driver.find_elements_by_css_selector(".quickdelete-wrap > input")[0]
inputTag.send_keys('python')
#如果只是解析网页中的数据,推荐将网页源代码扔给lxml(lxml底层用的c语言,解析效率更高)解析
#如果对元素进行输入、点击某个按钮等操作,则必须使用selenium
from selenium import webdriver
from selenium.webdriver.support.ui import Select

driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("http://www.dobai.cn/")

# Wrap the <select> element (looked up by its name attribute) in a helper.
selectBtn = Select(driver.find_element_by_name('jumpMenu'))
# Choose an option by position,
selectBtn.select_by_index(0)
# by the option's value attribute,
selectBtn.select_by_value("http://m.95xiu.com/")
# or by its visible text.
selectBtn.select_by_visible_text("95秀客户端")
from selenium import webdriver

driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("http://www.baidu.com/")

# Type a query into the search box, then press the search button.
inputTag = driver.find_element_by_id('kw')
inputTag.send_keys('python')
submitTag = driver.find_element_by_id('su')
submitTag.click()
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("http://www.baidu.com/")
inputTag = driver.find_element_by_id('kw')
submitBtn = driver.find_element_by_id('su')

# Queue a sequence of low-level actions; nothing runs until perform().
actions = ActionChains(driver)
actions.move_to_element(inputTag)
actions.send_keys_to_element(inputTag, 'python')
# BUG FIX: the original called send_keys_to_element(submitBtn) without the
# required keys argument (TypeError); the intent is to move to the submit
# button and click it.
actions.move_to_element(submitBtn)
actions.click(submitBtn)
actions.perform()
from selenium import webdriver

driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("http://www.baidu.com/")

# List every cookie the browser currently holds.
for cookie in driver.get_cookies():
    print(cookie)
# Look up a single cookie's value by its key.
print(driver.get_cookie("PSTM"))
# Remove every cookie at once...
driver.delete_all_cookies()
# ...or remove one cookie by key.
driver.delete_cookie("PSTM")
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get("http://www.douban.com/")
# Implicit wait: poll up to 10s; an exception is raised only if the element
# with id 'hfsjfjkghfj' never appears.
# BUG FIX: the method is implicitly_wait, not implictity_wait.
driver.implicitly_wait(10)
driver.find_element_by_id("hfsjfjkghfj")
# Explicit wait: the locator is passed as a (By.ID, 'form_email') tuple.
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'form_email')))
print(element)
from selenium import webdriver

driver_path = r'D:\tools\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)
driver.get('https://www.baidu.com/')
# Opens Douban in a new tab, but the driver still points at the Baidu tab.
driver.execute_script("window.open('https://www.douban.com/')")
# Switch the driver to tab index 1 (tabs are indexed in opening order;
# Baidu is 0).
# FIX: switch_to_window is deprecated; use switch_to.window, consistent
# with the spider classes later in this file.
driver.switch_to.window(driver.window_handles[1])
print(driver.current_url)  # confirm which window the driver now targets
print(driver.page_source)
from selenium import webdriver

driver_path = r'D:\tools\chromedriver.exe'

# Route all browser traffic through an HTTP proxy, then ask httpbin
# which IP it sees.
options = webdriver.ChromeOptions()
options.add_argument("--proxy-server=http://171.41.82.12:9999")
driver = webdriver.Chrome(executable_path=driver_path, chrome_options=options)
driver.get("http://httpbin.org/ip")
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 11 15:34:47 2019
@author: admin

Selenium-driven crawler for python job listings on lagou.com.
"""
from selenium import webdriver
from lxml import etree
import re
import csv
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class LagouSpider(object):
    driver_path = r"D:\tools\chromedriver.exe"

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        self.positions = []
        fp = open('lagou.csv', 'a', newline='', encoding='utf-8')
        # BUG FIX: field names must match the dict keys built in
        # parse_detail_page(); the original list ('company_name', 'location',
        # ...) made DictWriter.writerow raise ValueError on every row.
        self.writer = csv.DictWriter(fp, ['name', 'salary', 'city', 'work_years', 'desc'])
        self.writer.writeheader()

    def run(self):
        """Walk the listing pages until the 'next' button is disabled."""
        self.driver.get(self.url)
        while True:
            # Wait for the pager before grabbing the page, so page_source
            # reflects a fully loaded listing.
            WebDriverWait(driver=self.driver, timeout=50).until(
                EC.presence_of_all_elements_located((By.XPATH, "//div[@class='pager_container']/span[last()]")))
            source = self.driver.page_source
            self.parse_list_page(source)
            try:
                next_btn = self.driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
                if "pager_next_disabled" in next_btn.get_attribute("class"):
                    break
                else:
                    next_btn.click()
            except Exception:
                # Dump the page for debugging when the pager lookup fails.
                print(source)
            time.sleep(2)

    def parse_list_page(self, source):
        """Extract detail-page links from a listing page and visit each."""
        html = etree.HTML(source)
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            self.request_detail_page(link)
            time.sleep(2)

    def request_detail_page(self, url):
        """Open the detail page in a new tab, parse it, then close the tab."""
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(driver=self.driver, timeout=50).until(
            EC.presence_of_all_elements_located((By.XPATH, "//span[@class='name']")))
        source = self.driver.page_source
        self.parse_detail_page(source)
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Scrape one job's fields from a detail page and store them."""
        html = etree.HTML(source)
        position_name = html.xpath("//span[@class='name']/text()")[0]
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        salary = job_request_spans[0].xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath(".//text()")[0].strip()
        city = re.sub(r"[\s/]", "", city)
        work_years = job_request_spans[2].xpath(".//text()")[0].strip()
        work_years = re.sub(r"[\s/]", "", work_years)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        position = {
            'name': position_name,
            'salary': salary,
            'city': city,
            'work_years': work_years,
            'desc': desc
        }
        self.write_position(position)

    def write_position(self, position):
        """Append one row to lagou.csv and echo it to the console."""
        self.writer.writerow(position)
        print(position)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 24 09:56:14 2019
@author: admin

Selenium-driven crawler for python job listings on zhipin.com (Boss直聘).
"""
from lxml import etree
from selenium import webdriver
import time
import re
import csv
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


class BossSpider(object):
    driver_path = r"D:\tools\chromedriver.exe"

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=BossSpider.driver_path)
        # Listing page containing all job entries (taken from the Request URL).
        self.url = "https://www.zhipin.com/job_detail/?query=python&city=100010000&industry=&position="
        # Detail links in the page are relative; forgetting to prepend this
        # domain triggers "WebDriverException: unknown error: unhandled
        # inspector error".
        self.domain = "https://www.zhipin.com"
        fp = open('boss.csv', 'a', newline='', encoding='utf-8')
        self.writer = csv.DictWriter(fp, ['name', 'salary', 'company_name', 'location', 'work_experience', 'education', 'job_desc'])
        self.writer.writeheader()

    def run(self):
        """Iterate the listing pages until the 'next' button is disabled."""
        self.driver.get(self.url)
        while True:
            # Unlike 'view source', page_source reflects the rendered DOM,
            # so every job entry is visible in it.
            source = self.driver.page_source
            WebDriverWait(driver=self.driver, timeout=50).until(
                EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@class,'next')]")))
            self.parse_list_page(source)
            next_btn = self.driver.find_element_by_xpath("//a[contains(@class,'next')]")
            # On the last page the button's class becomes "next disabled".
            if "disabled" in next_btn.get_attribute("class"):
                break
            else:
                next_btn.click()
            time.sleep(1)

    def parse_list_page(self, source):
        """Extract the detail-page links (30 jobs per page) and visit each."""
        html = etree.HTML(source)
        links = html.xpath("//div[@class='info-primary']//a[position()=1]/@href")
        for link in links:
            url = self.domain + link
            self.request_detail_page(url)
            time.sleep(1)

    def request_detail_page(self, url):
        """Open the detail page in a new tab, parse it, then close the tab."""
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        source = self.driver.page_source
        self.parse_detail_page(source)
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Scrape one job's fields from a detail page and store them."""
        html = etree.HTML(source)
        position_name = html.xpath("//div[@class='info-primary']/div[@class='name']/h1/text()")[0]
        salary = html.xpath("//div[@class='name']/span[@class='salary']/text()")[0]
        salary = re.sub(r"[\s/]", "", salary)
        company_name = html.xpath("//a[@ka='job-detail-company']/text()")[1].strip()
        location = html.xpath("//div[@class='info-primary']/p[position()=1]/text()")[0]
        work_experience = html.xpath("//div[@class='info-primary']/p[position()=1]/text()")[1]
        education = html.xpath("//div[@class='info-primary']/p[position()=1]/text()")[2]
        job_desc = "\n".join(html.xpath("//div[@class='detail-content']/div[@class='job-sec'][position()=1]/div[@class='text']/text()")).strip()
        position = {
            'name': position_name,
            'salary': salary,
            'company_name': company_name,
            'location': location,
            'work_experience': work_experience,
            'education': education,
            'job_desc': job_desc
        }
        self.write_position(position)

    def write_position(self, position):
        """Append one row to boss.csv and echo it to the console."""
        self.writer.writerow(position)
        print(position)


if __name__ == '__main__':
    spider = BossSpider()
    spider.run()