Python web scraping (advanced), day 01

  2019-05-23. Today I studied fetching dynamic data with selenium + chromedriver:

Selenium acts like a robot: it can simulate human actions such as clicking, filling in data, and paging through results. chromedriver is the driver program that controls the Chrome browser. You have to go through chromedriver to obtain a driver object; only then can the crawler simulate a person's behaviour.
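(A side note of my own, not from the course: the snippets below use the Selenium 3 API. On Selenium 4 and later, executable_path and the find_element_by_* helpers are gone, so the equivalent setup looks roughly like this.)

# Selenium 4+ equivalent of the setup used below (my addition; the chromedriver
# path is just the one from these notes).
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(service=Service(r'D:\ChromeDriver\chromedriver.exe'))
driver.get('https://www.baidu.com/')
driver.find_element(By.ID, 'kw').send_keys('python')  # replaces find_element_by_id('kw')
driver.quit()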

# Author:K
from selenium import webdriver
import time

driver_path = r'D:\ChromeDriver\chromedriver.exe'

driver = webdriver.Chrome(executable_path=driver_path)

url = 'https://www.baidu.com/'
driver.get(url)

# Locate an element by class name
inputTag = driver.find_element_by_class_name('s_ipt')
inputTag.send_keys('python')

################################################## Clicking a button ##################################################
submitTag = driver.find_element_by_id('su')
submitTag.click()

################################################### Checkbox ###################################################
url = 'http://www.renren.com/'
driver.get(url)
autoLoginBtn = driver.find_element_by_id('autoLogin')
autoLoginBtn.click()

# Select dropdown: I haven't found a suitable site yet; once I do, replace 'xxx' with the real values
from selenium.webdriver.support.ui import Select
url = 'xxx'
select = Select(driver.find_element_by_id('xxx'))  # the element has to be wrapped in Select
select.select_by_value('xxx')

################################################### Action chains ###################################################
from selenium.webdriver.common.action_chains import ActionChains

driver = webdriver.Chrome(executable_path=driver_path)

url = 'https://www.baidu.com/'
driver.get(url)

inputTag = driver.find_element_by_id('kw')
print(inputTag)  # prints a single element: find_element (singular) returns one WebElement
summitBtn = driver.find_elements_by_id('su')[0]  # find_elements (plural) returns a list even for one button, hence the [0]
print(summitBtn)

actions = ActionChains(driver)
actions.move_to_element(inputTag)
actions.send_keys_to_element(inputTag, 'python')
actions.move_to_element(summitBtn)
actions.click(summitBtn)
actions.perform()  # why does this raise an error? (still unresolved)

################################################### Cookies ###################################################
url = 'https://www.baidu.com/'
driver.get(url)
for cookie in driver.get_cookies():
    print(cookie)

cookie = driver.get_cookie('BDORZ')
print(cookie)

driver.delete_all_cookies()  # delete all cookies

################################################### Page waits ###################################################
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

url = 'https://www.baidu.com'
driver.get(url)

'''
Implicit wait: set a maximum wait time on the driver; every element lookup keeps
retrying until the element appears or the timeout is reached.
(Downside: it effectively waits for the whole page to load, including elements you never use.)
'''
driver.implicitly_wait(5)

'''
Explicit wait: wait up to 5 seconds; continue as soon as the condition is met,
otherwise fail when the time runs out.
'''
element = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.ID, 'su'))
)

print(element)

time.sleep(10)
driver.close()

################################################### Switching windows ###################################################

url = 'https://www.baidu.com'
driver.get(url)
driver.execute_script('window.open("http://www.renren.com/")')
print(driver.current_url)
print(driver.window_handles)  # print the handles of the browser's open windows
driver.switch_to.window(driver.window_handles[1])  # switch the driver to the window at index 1 of the handle list
print(driver.current_url)

################################################### Proxies ###################################################

url = 'http://httpbin.org/ip'

options = webdriver.ChromeOptions()
options.add_argument('--proxy-server=http://120.234.63.196:3128')

driver = webdriver.Chrome(executable_path=driver_path, chrome_options=options)
driver.get(url)

################################################### WebElement ###################################################
url = 'https://www.baidu.com'
driver.get(url)
summitBtn = driver.find_elements_by_id('kw')  # find_elements returns a list
print(summitBtn)  # this prints a list, unlike the action-chain test above, because it uses find_elements (plural)
print(summitBtn[0].get_attribute('class'))  # index into the list before calling get_attribute

time.sleep(10)

driver.quit()
selenium_demo  

 

  I also did a small exercise today: scraping furniture-company data from the "Wutuojia" (乌托家) site. The code is below:

# Author:K
import requests
from lxml import etree
import os

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}

def parse_page(url):
    response = requests.get(url=url, headers=HEADERS)
    page_text = response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@class="rec-commodity-ul targetElement"]/li')
    for li in li_list:
        merchant_href = li.xpath('.//div[@class="impression"]/a/@href')[0]
        merchant_name = li.xpath('.//div[@class="impression"]/a/text()')[0]
        commodity_name = li.xpath('.//div[@class="material"]/a/text()')[0]
        # print(merchant_href, merchant_name, commodity_name)
        detail_page_text = requests.get(url=merchant_href, headers=HEADERS).text
        tree = etree.HTML(detail_page_text)
        div_infos = tree.xpath('//div[@class="brand-r"]')
        for div in div_infos:
            try:
                brand_name = div.xpath('./div[4]/dl/dd/text()')[0]
                addr = div.xpath('.//p/text()')[0]
                phone = div.xpath('.//dd[2]/text()')[0]
                # print(brand_name, addr, phone)

                # Persist to disk ('a+' creates the file if it doesn't exist yet, and the
                # context manager closes it on every path, which the original 'r+' open didn't)
                file_path = 'H:/乌托家/乌托家家具公司.txt'
                with open(file_path, 'a+', encoding='utf-8') as fp:
                    fp.seek(0)
                    if brand_name not in fp.read():
                        if '广东' in addr:  # only keep companies located in Guangdong
                            fp.write(brand_name + '   ' + addr + '    ' + phone + '\n\n')
                            print(brand_name, 'scraped successfully!')
            except Exception as e:
                print(e)


def get_page():
    for page in range(1, 413):
        url = 'http://www.wutuojia.com/item/list.html?page=' + str(page)
        parse_page(url)


def main():
    get_page()


if __name__ == '__main__':
    # Make sure the output directory exists before writing
    if not os.path.exists('H:/乌托家'):
        os.mkdir('H:/乌托家')
    main()
Wutuojia data scraper

  Key takeaway: when writing XPath, try to avoid matching on tags whose class attribute value contains spaces (i.e. multiple classes)!
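To spell that out for myself: an XPath predicate like @class="..." only matches when the attribute equals the whole string, spaces and order included, so multi-class values break easily. contains() on a single class is usually safer. A tiny self-contained lxml check (not part of the scraper above):

# Quick demonstration with lxml
from lxml import etree

html = '<ul class="rec-commodity-ul targetElement"><li>sofa</li></ul>'
tree = etree.HTML(html)

# Exact match: must reproduce the full attribute value, spaces included
print(tree.xpath('//ul[@class="rec-commodity-ul targetElement"]/li/text()'))  # ['sofa']

# Matching one class with contains() survives extra classes or reordering
print(tree.xpath('//ul[contains(@class, "rec-commodity-ul")]/li/text()'))     # ['sofa']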

 

  Addendum, 2019-05-24. Today I scraped Lagou (拉勾网) with both requests and selenium, and ran into quite a few problems:

    First, with requests: if you enter through the home-page listing you don't need to fetch the JSON data, so the parameters below (Referer, Cookie, form data) aren't needed; if you go through the search page you have to fetch the JSON instead (see the sketch below).
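For my own reference, a rough sketch of what that search/JSON route would look like, pieced together from the commented-out endpoint, Referer and form data in the code below. Using a Session that visits the list page first, and the assumed JSON layout (content -> positionResult -> result), are my guesses and not verified against the live site:

# Sketch only. Assumptions: Lagou still serves this Ajax endpoint, it still checks the
# session cookies picked up from the list page, and the JSON layout is what it was in 2019.
import requests

LIST_URL = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
AJAX_URL = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
    'Referer': LIST_URL,
}
DATA = {'first': 'false', 'pn': '1', 'kd': 'python'}

session = requests.Session()
session.get(LIST_URL, headers=HEADERS)               # pick up the anti-scraping cookies first
response = session.post(AJAX_URL, headers=HEADERS, data=DATA)
print(response.json())                               # assumed nesting: content -> positionResult -> result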

    Second, with selenium: why can I only extract two pages before the third page demands a login? And why does the second page show the same listings as the first? (Some guesses and a defensive sketch are after the error dump further down.)

# Author:K
import requests
from lxml import etree
import re
import time
import csv

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
    # If you enter via the home-page listing you don't need the JSON API, so the parameters
    # below are unnecessary; if you go through search you have to fetch the JSON data instead.
    # 'Referer':'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    # 'Cookie':'JSESSIONID=ABAAABAAAGFABEF23F973C7DA9EFCF4CFE88AB8D87FB58E; _ga=GA1.2.1328313976.1558606630; _gid=GA1.2.1095083654.1558606630; user_trace_token=20190523181726-f6923953-7d43-11e9-a6cf-525400f775ce; LGUID=20190523181726-f6923dac-7d43-11e9-a6cf-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1558606631; index_location_city=%E5%85%A8%E5%9B%BD; LGSID=20190523223505-f4d6a980-7d67-11e9-a6d0-525400f775ce; TG-TRACK-CODE=index_navigation; _gat=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1558624890; X_MIDDLE_TOKEN=9b65220a766ed951ca1a7dbd899dc36b; LGRID=20190523232554-0e6dd7d5-7d6f-11e9-a11a-5254005c3644; X_HTTP_TOKEN=33709e756aaf682260252685512be02aea6a03057c; SEARCH_ID=c0550a07c12045b4879a92a90883b79b',
}


# If you enter via the home-page listing you don't need the JSON API; only the search route needs it.
# DATA = {
#     'first':'false',
#     'pn':'1',
#     'kd':'python',
# }

def data_visualization(data):
    '''
    Write one row of job data to a CSV file.
    :param data:
    :return:
    '''
    # headers = ['job_name','salary','place','experience','education','job_detail','work_addr']
    with open('requests_lagou.csv', 'a+', encoding='utf_8_sig', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(data)


def parse_page(urls):
    '''
    Parse each detail page.
    :param urls:
    :return:
    '''
    for url in urls:
        response = requests.get(url=url, headers=HEADERS, proxies={'https': '120.234.63.196:3128'})
        detail_page_text = response.text
        tree = etree.HTML(detail_page_text)

        data = []
        job_name = tree.xpath('//div[@class="job-name"]/@title')[0]
        data.append(job_name)
        salary = tree.xpath('//span[@class="salary"]/text()')[0]
        data.append(salary)
        place = tree.xpath('//dd[@class="job_request"]/p[1]/span[2]/text()')[0]
        place = re.sub(r'[\s/]', '', place)  # strip whitespace and '/'
        data.append(place)
        experience = tree.xpath('//dd[@class="job_request"]/p[1]/span[3]/text()')[0]
        experience = re.sub(r'[\s/]', '', experience)  # strip whitespace and '/'
        data.append(experience)
        education = tree.xpath('//dd[@class="job_request"]/p[1]/span[4]/text()')[0]
        education = re.sub(r'[\s/]', '', education)  # strip whitespace and '/'
        data.append(education)
        job_detail = ''.join(tree.xpath('//div[@class="job-detail"]//text()')).strip()
        # data.append(job_detail)
        work_addr = ''.join(tree.xpath('//div[@class="work_addr"]//text()')).strip()
        work_addr = re.sub(r'[\s(查看地图)]', '', work_addr)  # strip whitespace and the "查看地图" (view map) text
        data.append(work_addr)

        # print(job_name, salary, place, experience, education)
        # print(job_detail)
        # print(work_addr)
        # print(data)
        # persist the row
        data_visualization(data)
        print(job_name, 'scraped successfully!')
        time.sleep(5)


def get_page():
    '''
    Fetch the starting list page.
    :return:
    '''
    # If you enter via the home-page listing you don't need the JSON API; only the search route needs it.
    # url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    url = 'https://www.lagou.com/zhaopin/Python/1/?filterOption=1'
    response = requests.post(url=url, headers=HEADERS, proxies={'https': '120.234.63.196:3128'})
    page_text = response.text
    tree = etree.HTML(page_text)
    detail_urls = tree.xpath('//a[@class="position_link"]/@href')  # list of detail-page URLs
    parse_page(detail_urls)  # parse the detail pages


def main():
    get_page()


if __name__ == '__main__':
    main()
Scraping Lagou with requests

 

# Author:K
# ---------- Why can I only extract two pages before the third demands a login?
# ---------- And why is the second page identical to the first?
from selenium import webdriver
from lxml import etree
import re
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

class LagouSpider(object):
    driver_path = r'D:\ChromeDriver\chromedriver.exe'

    def __init__(self):
        # self.options = webdriver.ChromeOptions()
        # self.options.add_argument('--proxy-server=http://163.204.247.107:9999')
        self.driver = webdriver.Chrome(executable_path=self.__class__.driver_path)  # chrome_options=self.options
        self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='


    def run(self):
        '''
        Entry point.
        :return:
        '''
        self.get_page()


    def get_page(self):
        '''
        Fetch each detail page's source and pass it to the parser.
        :return:
        '''
        self.driver.get(self.url)  # open the starting list page

        self.page = 0
        while True:
            tree = etree.HTML(self.driver.page_source)
            # aTag_list = self.driver.find_elements_by_class_name('position_link')
            detail_url_list = tree.xpath('//a[@class="position_link"]/@href')  # detail-page URLs
            # print(detail_url_list)  # debug
            for detail_url in detail_url_list:
                # print(detail_url)  # debug
                self.driver.execute_script('window.open("%s")' % detail_url)  # open a new window via JS
                self.driver.switch_to.window(self.driver.window_handles[1])  # switch to the new (second) window
                source = self.driver.page_source  # grab the detail page's source
                # print(source)  # debug

                # Before parsing, wait a moment to make sure the page has actually loaded
                # WebDriverWait(self.driver,10).until(
                #     EC.presence_of_element_located((By.XPATH,'//div[@class="job-name"]/@title'))
                # )

                self.parse_page(source)
                self.driver.close()  # close the current detail page
                self.driver.switch_to.window(self.driver.window_handles[0])  # switch back to the list page
                # break  # for testing, so the loop doesn't run too many times
            self.page += 1
            print('===============page %s was over===============' % self.page)

            # Wait up to 10 seconds for the "next page" button to be present
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//div[@class="pager_container"]/span[last()]'))
            )

            next_button = self.driver.find_elements_by_xpath('//div[@class="pager_container"]/span[last()]')[0]  # the "next page" button
            print(next_button.get_attribute('class'))  # debug
            if next_button.get_attribute('class') == 'pager_next pager_next_disabled':  # exit condition: on the last page the button is disabled and clicking it does nothing
                break
            else:
                next_button.click()


    def parse_page(self, source):
        '''
        Parse a detail page.
        :param source:
        :return:
        '''

        tree = etree.HTML(source)

        data = []
        job_name = tree.xpath('//div[@class="job-name"]/@title')[0]
        data.append(job_name)
        salary = tree.xpath('//span[@class="salary"]/text()')[0]
        data.append(salary)
        place = tree.xpath('//dd[@class="job_request"]/p[1]/span[2]/text()')[0]
        place = re.sub(r'[\s/]', '', place)  # strip whitespace and '/'
        data.append(place)
        experience = tree.xpath('//dd[@class="job_request"]/p[1]/span[3]/text()')[0]
        experience = re.sub(r'[\s/]', '', experience)  # strip whitespace and '/'
        data.append(experience)
        education = tree.xpath('//dd[@class="job_request"]/p[1]/span[4]/text()')[0]
        education = re.sub(r'[\s/]', '', education)  # strip whitespace and '/'
        data.append(education)
        job_detail = ''.join(tree.xpath('//div[@class="job-detail"]//text()')).strip()
        data.append(job_detail)
        work_addr = ''.join(tree.xpath('//div[@class="work_addr"]//text()')).strip()
        work_addr = re.sub(r'[\s(查看地图)]', '', work_addr)  # strip whitespace and the "查看地图" (view map) text
        data.append(work_addr)

        print(data)


if __name__ == '__main__':
    spider = LagouSpider()
    spider.run()
Scraping Lagou with selenium

 

  The problem with this one: when the spider reaches the third page, a login page pops up.


  The error is:

    job_name = tree.xpath('//div[@class="job-name"]/@title')[0]
    IndexError: list index out of range
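My current guesses, not verified: the IndexError means the //div[@class="job-name"] node simply isn't in the page source, most likely because the page served was the login wall (or the detail page hadn't finished rendering when page_source was read). And page 2 repeating page 1 is probably because page_source is grabbed again before the Ajax refresh triggered by the next-page click has finished. A defensive sketch I want to try, reusing the selectors from the spider above:

# Sketch only; whether this gets past Lagou's login wall is a separate problem.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from lxml import etree

def safe_parse(driver):
    # Wait for the job title element itself (note: presence_of_element_located needs an
    # element locator, not an XPath ending in /@title)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'job-name'))
        )
    except Exception:
        print('job-name never appeared, probably the login wall; skipping this page')
        return None
    titles = etree.HTML(driver.page_source).xpath('//div[@class="job-name"]/@title')
    if not titles:  # guard against IndexError: list index out of range
        return None
    return titles[0]

# For the duplicated page: after clicking "next", wait for the old list to go stale
# before reading page_source again (old_first_link / next_button as in the spider above).
# old_first_link = driver.find_element_by_xpath('//a[@class="position_link"]')
# next_button.click()
# WebDriverWait(driver, 10).until(EC.staleness_of(old_first_link))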

 

 

  I also did another small selenium exercise: scraping a novel from Duoduo Kanshu (多多看书). It can only fetch the first few dozen chapters; I don't know why it stops working after that.

# Author:K
from selenium import webdriver
from lxml import etree
import re
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver_path = r'D:\ChromeDriver\chromedriver.exe'

driver = webdriver.Chrome(executable_path=driver_path)


def parse_page(source):
    # WebDriverWait(driver,20).until(
    #     EC.presence_of_element_located((By.XPATH,'//div[@id="contentWp"]//text()'))
    # )
    driver.implicitly_wait(100)
    name = re.findall(r"bname = '(.*?)'", source)[0]  # the novel's name
    # NOTE: the HTML tags that wrapped this pattern were lost when the notes were posted;
    # the original matched the chapter-heading element. Best-effort reconstruction:
    chapter = re.findall(r'(第.*?章)', source)[0]  # chapter heading, e.g. 第1章
    # print(name, chapter)  # debug
    tree = etree.HTML(source)
    content = ''.join(tree.xpath('//div[@id="contentWp"]//text()')).strip()
    file_path = 'H:/多多看书/' + name + '/' + chapter + '.txt'

    with open(file_path, 'w', encoding='utf-8') as fp:
        fp.write(content)

    aTag_button = driver.find_elements_by_class_name('next')[0]  # the "next chapter" link

    if aTag_button:
        aTag_button.click()
        if len(driver.window_handles) > 1:  # if the click opened an extra window, close it and switch back
            driver.switch_to.window(driver.window_handles[1])
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
        time.sleep(3)
        new_source = driver.page_source
        return parse_page(new_source)


def get_page(url):
    driver.get(url)
    close_button = driver.find_element_by_class_name('close')  # dismiss the popup on the page
    close_button.click()
    source = driver.page_source
    # print(source)  # debug
    parse_page(source)


def main():
    url = 'https://xs.sogou.com/chapter/4579119319_150046829984508/'
    get_page(url)


if __name__ == '__main__':
    main()
Scraping Duoduo Kanshu with selenium

 

 

  The problem with this one:

selenium.common.exceptions.WebDriverException: Message: unknown error: Element is not clickable at point (874, 515). Other element would receive the click:
(Session info: chrome=74.0.3729.108)
(Driver info: chromedriver=2.46.628402 (536cd7adbad73a3783fdc2cab92ab2ba7ec361e1),platform=Windows NT 10.0.17134 x86_64)
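Note to self (not yet tested against this site): this error means another element, typically a floating banner or the popup that the close button is supposed to dismiss, is sitting on top of the "next" link at the moment of the click. Two common workarounds are scrolling the link into view before clicking, or clicking it through JavaScript so the overlap check is skipped:

# Assumes the `driver` and the 'next' link lookup from the script above.
next_link = driver.find_elements_by_class_name('next')[0]

# Option 1: scroll it into view, then click normally
driver.execute_script('arguments[0].scrollIntoView({block: "center"});', next_link)
next_link.click()

# Option 2: click via JavaScript, bypassing the "element would receive the click" check
driver.execute_script('arguments[0].click();', next_link)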

 

  I'll come back and fill this in once I've figured it out!
Reposted from: https://www.cnblogs.com/KisInfinite/p/10915625.html
