爬虫--requests、BeautifulSoup入门

1、通过关键字进行搜索

import requests

# Baidu keyword-search endpoint: https://www.baidu.com/s?wd=keyword
params = {'wd': 'Python'}
# A browser-like Accept header lets the request skip Baidu's verification page.
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'}
resp = requests.get("https://www.baidu.com/s", headers=headers, params=params)
print(resp.request.url)        # final URL with the query string appended
print(resp.request.headers)    # headers actually sent
print(resp.status_code, resp.encoding, len(resp.text))
resp.encoding = 'utf-8'        # force UTF-8 so the page text decodes correctly
print(resp.text)

2、爬取网络图片将其保存到本地

import os
import requests

# Download one image and save it under `root`, skipping the download
# when the file already exists on disk.
url = "https://www.zhifure.com/upload/images/2018/7/13181228598.jpg"   # image URL
root = "C://Users//123//Desktop//新建文件夹//"  # root directory for saved files
path = root + url.split('/')[-1]  # save path; filename is taken from the URL

try:
    if not os.path.exists(root):  # create the root directory on first run
        os.mkdir(root)
    if not os.path.exists(path):  # only fetch when not already saved
        r = requests.get(url)
        # Fail loudly on HTTP errors instead of silently saving an error page.
        r.raise_for_status()
        # 'wb': binary write mode — r.content is raw bytes; the `with`
        # statement closes the file, so no explicit f.close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except Exception:  # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt pass through
    print("爬取失败")

爬虫--requests、BeautifulSoup入门_第1张图片
爬虫--requests、BeautifulSoup入门_第2张图片

from bs4 import BeautifulSoup
import requests

# Fetch the demo page and parse it with Python's built-in HTML parser.
resp = requests.get("http://python123.io/ws/demo.html")
page = resp.text
soup = BeautifulSoup(page, "html.parser")
print(soup.prettify())  # pretty-printed view of the parsed tree

title_tag = soup.title  # soup.<name> returns the first tag with that name
print(title_tag)
# .name is the tag's name; .parent is the enclosing tag.
print(title_tag.parent.name, title_tag.parent.parent.name)

first_a = soup.a  # first <a> tag in the document
print(first_a.attrs)           # all attributes as a dict
print(first_a.attrs['class'])  # value of the class attribute
print(first_a.attrs['href'])   # value of the href attribute
print(type(first_a.attrs))     # attrs is a plain dict
print(type(first_a))           # bs4 Tag object
print(soup.a.string)  # text contained in the tag
print(soup.p.string)
#运行结果
#
#  
#   </span>
#    This is a python demo page
#
#  
#  
#   

# # The demo python introduces several python courses. # #

#

# Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses: # # Basic Python # # and # # Advanced Python # # . #

# # # # This is a python demo page # head html # {'href': 'http://www.icourse163.org/course/BIT-268001', 'class': ['py1'], 'id': 'link1'} # ['py1'] # http://www.icourse163.org/course/BIT-268001 # # # Basic Python # The demo python introduces several python courses. # # Process finished with exit code 0

爬虫--requests、BeautifulSoup入门_第3张图片
下行遍历
爬虫--requests、BeautifulSoup入门_第4张图片

import requests
# BUG FIX: the class is named BeautifulSoup (capital S), not Beautifulsoup —
# the original import raised ImportError while the code below calls BeautifulSoup.
from bs4 import BeautifulSoup

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo,"html.parser")
#print(soup.prettify())

# Downward traversal: .contents is a list of direct children;
# .children is an iterator over those same nodes.
print(soup.head)
print(soup.head.contents)
print(soup.body.contents)
print(len(soup.body.contents)) # number of direct children of <body> (whitespace strings included)
print(soup.body.contents[0])   # children can be indexed like a list

for child in soup.body.children:  # iterate over the direct children
    print(child)

#This is a python demo page
# [This is a python demo page]
# ['\n', 

The demo python introduces several python courses.

, '\n',

Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses: # Basic Python and Advanced Python.

, '\n']
# 5 # # # # #

The demo python introduces several python courses.

# # #

Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses: # Basic Python and Advanced Python.

# # # # Process finished with exit code 0

上行遍历
爬虫--requests、BeautifulSoup入门_第5张图片

# Upward traversal: .parent and .parents walk toward the document root.
r = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(r.text, "html.parser")
print(soup.title.parent)  # the tag enclosing <title>
print(soup.html.parent)   # <html>'s parent is the whole document
print(soup.parent)        # the soup object itself has no parent -> None

# Walk every ancestor of the first <a>; the soup node reports name '[document]'.
for ancestor in soup.a.parents:
    print(ancestor if ancestor is None else ancestor.name)

平行遍历
爬虫--requests、BeautifulSoup入门_第6张图片
爬虫--requests、BeautifulSoup入门_第7张图片

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo,"html.parser")
print(soup.a.next_sibling) #next_sibling: the next sibling node in HTML source order (may be a NavigableString, not a tag)
print(soup.a.next_sibling.next_sibling)
print(soup.a.previous_sibling)  #previous_sibling: the previous sibling node in HTML source order (original comment wrongly said next_sibling)
print(soup.a.previous_sibling.previous_sibling)

基于bs4的相关查找

爬虫--requests、BeautifulSoup入门_第8张图片

import requests
import re
from bs4 import BeautifulSoup

resp = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(resp.text, "html.parser")

# Passing a list matches any of the listed tag names.
print(soup.find_all(['a','b']))

# Iterate over every <a> tag in the document.
for anchor in soup.find_all('a'):
    print(anchor)

# find_all(True) yields every tag in the document.
for node in soup.find_all(True):
    print(node.name)

# A compiled regex matches tag names containing 'b' (e.g. b, body).
for node in soup.find_all(re.compile('b')):
    print(node.name)

# The second positional argument filters on the class attribute value.
print(soup.find_all('p','course'))

# Keyword arguments filter on attribute values.
print(soup.find_all(id = 'link1'))
print(soup.find_all(id = re.compile('link')))  # regex: id values starting with 'link'

# string= searches the text between tags rather than the tags themselves.
print(soup.find_all(string = 'Basic Python'))
print(soup.find_all(string = re.compile('Python')))

# Calling the soup object directly is shorthand for find_all.
print(soup('a') == soup.find_all('a'))

爬虫--requests、BeautifulSoup入门_第9张图片
爬虫--requests、BeautifulSoup入门_第10张图片

你可能感兴趣的:(无脑的Python笔记)