Day03 爬取京东商品信息+元素交互操作+BeautifulSoup4

一、 先在京东搜索墨菲定律,然后对页面上的商品信息进行爬取:

 1 from selenium import webdriver
 2 import time
 3 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 4 
 5 driver = webdriver.Chrome()
 6 num = 1
 7 try:
 8     driver.implicitly_wait(10)
 9     # 往京东发送请求
10     driver.get('https://www.jd.com/')
11     #往京东主页输入框输入墨菲定律按回车键
12     input_tag = driver.find_element_by_id('key')
13     input_tag.send_keys('墨菲定律')
14     input_tag.send_keys(Keys.ENTER)
15 
16     time.sleep(5)
17 
18     good_list = driver.find_elements_by_class_name('gl-item')
19     for good in good_list:
20         # print(good)
21 
22         good_name = good.find_element_by_css_selector('.p-name em').text
23         # print(good_name)
24 
25         #商品链接
26         good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
27         # print(good_url)
28 
29         #商品价格
30         good_price = good.find_element_by_class_name('p-price').text
31         # print(good_price)
32 
33         #商品评价
34         good_commit = good.find_element_by_class_name('p-commit').text
35         # print(good_commit)
36 
37         good_content = f'''
38         num:{num}
39         商品名称:{good_name}
40         商品连接:{good_url}
41         商品价格:{good_price}
42         商品评价:{good_commit}
43         \n
44         '''
45         print(good_content)
46         with open('jd.txt','a',encoding='utf-8') as f:
47             f.write(good_content)
48         num+=1
49 
50     #找到下一页并点击
51     next_tag = driver.find_element_by_class_name('pn-next')
52     next_tag.click()
53 
54     time.sleep(10)
55 
56 finally:
57     driver.close()

然后我们对上面代码进行升级,使其能够下拉页面、翻到下一页,从而爬取更多的商品信息:

将爬取信息的步骤写成一个递归函数进行调用

 1 from selenium import webdriver
 2 import time
 3 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 4 
 5 driver = webdriver.Chrome()
 6 
 7 def get_good(driver):
 8     num = 1
 9     try:
10         time.sleep(5)
11 
12         # 下拉滑动5000px
13         js_code = '''
14                window.scrollTo(0,5000)
15            '''
16         driver.execute_script(js_code)
17         # 等待5秒等待商品加载
18         time.sleep(5)
19 
20         good_list = driver.find_elements_by_class_name('gl-item')
21         for good in good_list:
22 
23             #商品名称
24             good_name = good.find_element_by_css_selector('.p-name em').text
25 
26             # 商品链接
27             good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
28 
29             # 商品价格
30             good_price = good.find_element_by_class_name('p-price').text
31 
32             # 商品评价
33             good_commit = good.find_element_by_class_name('p-commit').text
34 
35             good_content = f'''
36             num:{num}
37             商品名称:{good_name}
38             商品连接:{good_url}
39             商品价格:{good_price}
40             商品评价:{good_commit}
41             \n
42             '''
43             print(good_content)
44             with open('jd.txt', 'a', encoding='utf-8') as f:
45                 f.write(good_content)
46             num += 1
47 
48         # 找到下一页并点击
49         next_tag = driver.find_element_by_class_name('pn-next')
50         next_tag.click()
51 
52         time.sleep(5)
53         #递归调用函数本身
54         get_good(driver)
55 
56     finally:
57         driver.close()
58 
59 if __name__ == '__main__':
60     driver = webdriver.Chrome()
61 
62     try:
63         driver.implicitly_wait(10)
64         # 往京东发送请求
65         driver.get('https://www.jd.com/')
66         # 往京东主页输入框输入墨菲定律按回车键
67         input_tag = driver.find_element_by_id('key')
68         input_tag.send_keys('墨菲定律')
69         input_tag.send_keys(Keys.ENTER)
70 
71         #调用获取商品信息函数
72         get_good(driver)
73     finally:
74         driver.close()

二、元素交互操作

 1.在京东先搜索‘围城’,再清空输入栏,输入'墨菲定律'进行搜索

 1 from selenium import webdriver
 2 from selenium.webdriver import ActionChains
 3 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 4 import time
 5 
 6 
 7 driver = webdriver.Chrome()
 8 
 9 try:
10     driver.implicitly_wait(10)
11     driver.get('https://www.jd.com/')
12     time.sleep(5)
13     #点击清除
14     input = driver.find_element_by_id('key')
15     input.send_keys('围城')
16 
17     #通过class查找搜索按钮
18     search = driver.find_element_by_class_name('button')
19     search.click()  #点击按钮搜索
20 
21     time.sleep(3)
22 
23     input2 = driver.find_element_by_id('key')
24     input2.clear()   #清空输入框
25 
26     time.sleep(1)
27 
28     input2.send_keys('墨菲定律')
29     input2.send_keys(Keys.ENTER)
30 
31     time.sleep(10)
32 
33 finally:
34     driver.close()
View Code

 2.将起始方块移动到目标方块中

 1 from selenium import webdriver
 2 from selenium.webdriver import ActionChains
 3 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 4 import time
 5 driver = webdriver.Chrome()
 6 
 7 try:
 8     driver.implicitly_wait(10)
 9     driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
10     time.sleep(5)
11 
12     #遗弃方法
13     # driver.switch_to_frame()
14     #新方法
15     driver.switch_to.frame('iframeResult')
16     time.sleep(1)
17 
18     #获取动作链对象
19     action = ActionChains(driver)
20     #启示方块id:draggable
21     source = driver.find_element_by_id('draggable')
22 
23     #目标方块id:droppable
24     target = driver.find_element_by_id('droggable')
25 
26     #方式一秒移
27     #起始方块瞬间移动到目标方块中
28     #拟定好一个动作,需要执行的方法perform
29     action.drag_and_drop(source,target).perform()
30 
31     time.sleep(10)
32 finally:
33     driver.close()
View Code

 3.将起始方块一步一步地移动到目标方块中

 1 from selenium import webdriver
 2 from selenium.webdriver import ActionChains
 3 from selenium.webdriver.common.keys import Keys  # 键盘按键操作
 4 import time
 5 
 6 driver = webdriver.Chrome()
 7 
 8 try:
 9     driver.implicitly_wait(10)
10     driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
11     time.sleep(5)
12 
13     #遗弃方法
14     # driver.switch_to_frame()
15     #新方法
16     driver.switch_to.frame('iframeResult')
17     time.sleep(1)
18 
19     #启示方块id:draggable
20     source = driver.find_element_by_id('draggable')
21 
22     #目标方块id:droppable
23     target = driver.find_element_by_id('droppable')
24 
25     print(source.size) #大小
26     print(source.text)  #文本
27     print(source.tag_name)  #标签名
28     print(source.location)  #坐标
29 
30     #找到滑动距离
31     distance = target.location['x']-source.location['x']
32 
33     #按住起始滑块
34     ActionChains(driver).click_and_hold(source).perform()
35     #方式二一点一点移
36     s=0
37     while s < distance:
38         #获取动作链对象
39         #每次移动距离
40         ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform()
41         s+=2
42         time.sleep(0.1)
43 
44     #松开起始滑块
45     ActionChains(driver).release().perform()
46     time.sleep(10)
47 finally:
48     driver.close()
View Code

 4.目标网页执行js代码

 1 from selenium import webdriver
 2 from selenium.webdriver import ActionChains
 3 import time
 4 driver = webdriver.Chrome()
 5 try:
 6     driver.implicitly_wait(10)
 7 
 8     driver.get('https://www.baidu.com/')
 9     driver.execute_script(
10         'alert("你好呀!!!")'
11     )
12     time.sleep(10)
13 finally:
14     driver.close()
View Code

 5.模拟浏览器的前进后退

 1 from selenium import webdriver
 2 import time
 3 driver = webdriver.Chrome()
 4 try:
 5     driver.implicitly_wait(10)
 6 
 7     driver.get('https://www.baidu.com/')
 8     driver.get('https://www.taobao.com/')
 9     driver.get('https://www.sina.com.cn/')
10     #回退
11     driver.back()
12     time.sleep(5)
13     #前进
14     driver.forward()
15     time.sleep(3)
16 finally:
17     driver.close()
View Code

 

三、BeautifulSoup4

 1.bs4的安装与使用

 1 '''
 2 安装解析器:
 3     pip3 install lxml
 4 安装解析库:
 5     pip3 install bs4
 6 
 7 '''
 8 html_doc ="""
 9 The Dormouse's story
10 
11 

$37

12 13

Once upon a time there were three little sisters; and their names were 14 Elsie, 15 Lacie and 16 Tillie; 17 and they lived at the bottom of a well.

18

...

19 """ 20 21 from bs4 import BeautifulSoup 22 23 #python自带的解析库 24 # soup = BeautifulSoup(html_doc,'html.parser') 25 26 #调用bs4得到一个soup对象 27 soup = BeautifulSoup(html_doc,'lxml') 28 29 #打印bs4对象 30 # print(soup) 31 #打印bs4类型 32 # print(type(soup)) 33 34 #美化功能 35 html=soup.prettify() 36 print(html)
View Code

 2.bs4之遍历文档树

 1 '''
 2 安装解析器:
 3     pip3 install lxml
 4 安装解析库:
 5     pip3 install bs4
 6 
 7 '''
 8 html_doc ="""
 9 The Dormouse's story
10 
11 

$37

12 13

Once upon a time there were three little sisters; and their names were 14 Elsie, 15 Lacie and 16 Tillie; 17 and they lived at the bottom of a well.

18

...

19 """ 20 21 from bs4 import BeautifulSoup 22 23 soup = BeautifulSoup(html_doc,'lxml') 24 25 #遍历文档树 26 # 1、直接使用 ***** 27 print(soup.html) 28 print(type(soup.html)) 29 print(soup.a) 30 print(soup.p) 31 32 # 2、获取标签的名称 33 print(soup.a.name) 34 35 # 3、获取标签的属性 ***** 36 print(soup.a.attrs) #获取a标签内的所有属性 37 print(soup.a.attrs['href']) 38 39 # 4、获取标签文本的内容 ***** 40 print(soup.p.text) # $37 41 42 # 5、嵌套选择 43 print(soup.html.body.p) 44 45 # 6、子节点、子孙节点 46 print(soup.p.children) #返回迭代器对象 47 print(list(soup.p.children)) #[$37] 48 49 # 7、父节点、祖先节点 50 print(soup.b.parent) 51 print(soup.b.parents) 52 print(list(soup.b.parents)) 53 54 55 # 8、兄弟节点 (sibling: 兄弟姐妹) 56 print(soup.a) 57 # 获取下一个兄弟节点 58 print(soup.a.next_sibling) 59 # 获取下一个的所有兄弟节点,返回的是一个生成器 60 print(soup.a.next_siblings) 61 print(list(soup.a.next_siblings)) 62 # 63 # 获取上一个兄弟节点 64 print(soup.a.previous_sibling) 65 # 获取上一个的所有兄弟节点,返回的是一个生成器 66 print(list(soup.a.previous_siblings))
View Code

 3.bs4之搜索文档树

  1 '''
  2 find:找第一个
  3 find_all:找所有
  4 标签查找与属性查找:
  5     name属性
  6             name 标签名
  7             attrs 属性查找匹配
  8             text 文本匹配
  9 
 10     标签:
 11         - 字符串过滤器   字符串全局匹配
 12 
 13         - 正则过滤器
 14             re模块匹配
 15 
 16         - 列表过滤器
 17             列表内的数据匹配
 18 
 19         - bool过滤器
 20             True匹配
 21 
 22         - 方法过滤器
 23             用于一些要的属性以及不需要的属性查找。
 24 
 25     属性:
 26         - class_
 27         - id
 28 '''
 29 
 30 
 31 html_doc = """
 32 The Dormouse's story

$37

Once upon a time there were three little sisters; and their names wereElsieLacie andTillieand they lived at the bottom of a well.

...

33 """ 34 35 from bs4 import BeautifulSoup 36 37 soup = BeautifulSoup(html_doc,'lxml') 38 39 #name 标签名 40 # attrs 属性查找匹配 41 # text 文本匹配 42 #find与find_all搜索文档 43 44 ''' 45 字符串过滤器 46 ''' 47 p = soup.find(name='p') 48 p_s = soup.find_all(name='p') 49 print(p) 50 print(p_s) 51 52 #name+attrs 53 p = soup.find(name='p',attrs={"id":"p"}) 54 print(p) 55 56 #name+text 57 p = soup.find(name='title',text="The Dormouse's story") 58 print(p) 59 60 #name+attrs+text 61 tag = soup.find(name='a',attrs={"class":"sister"},text='Elsie') 62 print(tag) 63 64 65 ''' 66 -正则过滤器 67 re模块匹配 68 ''' 69 import re 70 #name 71 #根据re模块匹配带有a的节点 72 a = soup.find(name=re.compile('a')) 73 a_s = soup.find_all(name=re.compile('a')) 74 print(a) 75 print(a_s) 76 77 #attrs 78 a = soup.find(attrs={"id":re.compile('link')}) 79 print(a) 80 81 82 #列表过滤器 83 #列表内数据匹配 84 print(soup.find(name=['a','p','html',re.compile('a')])) 85 print(soup.find_all(name=['a','p','html',re.compile('a')])) 86 87 # bool过滤器 88 #True匹配 89 print(soup.find(name=True,attrs={"id":True})) 90 91 #方法过滤器 92 #用于一些要的属性以及不需要的属性查找 93 def have_id_not_class(tag): 94 if tag.name == 'p' and tag.has_attr("id") and not tag.has_attr("class"): 95 return tag 96 print(soup.find_all(name=函数对象) 97 print(soup.find_all(name=have_id_not_class)) 98 99 #补充说明: 100 #id 101 a = soup.find(id="link2") 102 print(a) 103 104 #class 105 p = soup.find(class_='sister') 106 print(p)
View Code

 

转载于:https://www.cnblogs.com/tanknb/p/11129164.html

你可能感兴趣的:(Day03 爬取京东商品信息+元素交互操作+BeautifulSoup4)