sudo apt-get install python-bs4
sudo easy_install pip
pip install beautifulsoup4
pip install beautifulsoup4   # Python 2 environment
# or
pip3 install beautifulsoup4  # Python 3 environment
python  # check that Python is installed
from urllib.request import urlopen  # check that the urllib module is available
from bs4 import BeautifulSoup  # check that the bs4 module is available
If none of the three commands above produces an error, the environment is ready.
# 1. Import the request module from the urllib package
from urllib import request
# 2. Request the URL
resp = request.urlopen('http://www.baidu.com')
# 3. Print the data from the response object
print(resp.read().decode("utf-8"))
from bs4 import BeautifulSoup  # import the BeautifulSoup class (used for parsing later)
from urllib import request     # import the request module from urllib
url = "http://www.baidu.com/"
resp = request.urlopen(url)
print(resp.read().decode("utf-8"))
from urllib import request
url = "http://www.baidu.com"
key = "User-Agent"
value = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.42"
req = request.Request(url)
req.add_header(key, value)
resp = request.urlopen(req)
print(resp.read().decode("utf-8"))
# 1. Import parse from the urllib package
from urllib import parse
# 2. Build the POST data with urlencode (key1/val1 ... keyn/valn are placeholders)
postData = parse.urlencode([
    (key1, val1),
    (key2, val2),
    (keyn, valn)
])
# 3. Send the POST request with postData (data must be bytes)
resp = request.urlopen(req, data=postData.encode('utf-8'))
# 4. Get the response status code
resp.status
# 5. Get the reason phrase returned by the server
resp.reason
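A minimal end-to-end sketch of these steps; the key/value pairs and the httpbin.org echo endpoint are stand-ins chosen for illustration, not part of the original example:

```python
from urllib import request, parse

# Build the form body from a list of key/value pairs
postData = parse.urlencode([("name", "value"), ("page", "1")])

# Build the request and send the encoded body (data must be bytes)
req = request.Request("https://httpbin.org/post")
resp = request.urlopen(req, data=postData.encode("utf-8"))

print(resp.status)   # status code, e.g. 200
print(resp.reason)   # reason phrase, e.g. OK
print(resp.read().decode("utf-8"))
```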
from urllib import request
from urllib.request import urlopen
from urllib import parse
url = "https://m.thsrc.com.tw/TimeTable/Search"
headers = {
'User-Agent':'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
# # Proxy IP, provided by Kuaidaili
# proxy = '124.94.203.122:20993'
# proxy_values = "%(ip)s" % {'ip': proxy}
# proxies = {"http": proxy_values, "https": proxy_values}
#
# # Set up the proxy
# handler = request.ProxyHandler(proxies)
# opener = request.build_opener(handler)
data = {
"SearchType": "S",
"Lang": "TW",
"StartStation": "NanGang",
"EndStation": "ZuoYing",
"OutWardSearchDate": "2022/10/18",
"OutWardSearchTime": "14:30",
"ReturnSearchDate": "2022/10/18",
"ReturnSearchTime": "14:30",
"DiscountType": ""
}
data = parse.urlencode(data).encode("utf8")  # encode the parameters
req = request.Request(url=url, data=data, headers=headers, method="POST")  # build the request
resp = request.urlopen(req)
# resp = opener.open(req).read()  # use this call instead when going through the proxy
print(resp.read().decode("utf-8"))
If the request is rejected (access denied), see: https://blog.csdn.net/kdl_csdn/article/details/103989024
Output:
import requests
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.42"
}
url = "https://m.thsrc.com.tw/TimeTable/Search"
params = {
"SearchType": "S",
"Lang": "TW",
"StartStation": "NanGang",
"EndStation": "ZuoYing",
"OutWardSearchDate": '2022/10/18',
"OutWardSearchTime": "14:00",
"ReturnSearchDate": "2022/10/18",
"ReturnSearchTime": "14:00",
"DiscountType": ""
}
resp = requests.post(url=url, headers=headers, params=params)
# print(resp.status_code) # 200
print(resp.text)
Output:
Tools for testing requests by hand: Postman and Fiddler.
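One detail worth noting about the requests version above: `params=` appends the fields to the URL query string, whereas the urllib example sends them as a form-encoded body. To mirror the urllib behaviour, pass the same dictionary via `data=` instead (a short sketch reusing the `url`, `headers`, and `params` defined above):

```python
# Send the fields as a form-encoded request body instead of query-string parameters
resp = requests.post(url=url, headers=headers, data=params)
print(resp.status_code)
print(resp.text)
```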
Parser | Usage | Advantages | Disadvantages |
---|---|---|---|
Python standard library | BeautifulSoup(markup, "html.parser") | 1. Built into Python; 2. Reasonable speed; 3. Lenient with malformed documents | Poor lenience in versions before Python 2.7.3 / 3.2.2 |
lxml HTML parser | BeautifulSoup(markup, "lxml") | 1. Very fast; 2. Lenient with malformed documents | Requires a C library |
lxml XML parser | BeautifulSoup(markup, ["lxml", "xml"]) or BeautifulSoup(markup, "xml") | 1. Very fast; 2. The only parser that supports XML | Requires a C library |
html5lib parser | BeautifulSoup(markup, "html5lib") | 1. Best tolerance of bad markup; 2. Parses documents the way a browser does; 3. Produces valid HTML5 | Very slow; depends on an external Python package |
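A small sketch of how the parser name is passed when constructing a soup; `"lxml"` and `"html5lib"` only work if those packages are installed (`pip install lxml html5lib`):

```python
from bs4 import BeautifulSoup

markup = "<p class='title'><b>The Dormouse's story</b></p>"

# Built-in parser: no extra dependencies
soup_std = BeautifulSoup(markup, "html.parser")

# lxml / html5lib: faster or more lenient, but require extra packages
soup_lxml = BeautifulSoup(markup, "lxml")
soup_h5 = BeautifulSoup(markup, "html5lib")

print(soup_std.b.string)  # The Dormouse's story
```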
soup.title  # the first <title> tag
# <title>The Dormouse's story</title>
soup.title.name  # name of the first <title> tag
# u'title'
soup.title.string  # text inside the first <title> tag
# u'The Dormouse's story'
soup.title.parent.name  # name of the parent of the first <title> tag
# u'head'
soup.p  # the first <p> tag
# <p class="title"><b>The Dormouse's story</b></p>
soup.p['class']  # class attribute of the first <p> tag
# u'title'
soup.a  # the first <a> tag
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
soup.find_all('a')  # all <a> tags
"""
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
"""
soup.find(id="link3")  # the first tag with id="link3"
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
from bs4 import BeautifulSoup as bs
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = bs(html_doc, "html.parser")
# print(soup.prettify())
print(soup.title.string)  # text of the title tag
print(soup.a)  # the first a tag
print(soup.find(id="link2"))  # the element with id="link2"
print(soup.find(id="link2").string)  # text of the element with id="link2" (.string only works when the content has no nested tags)
print(soup.find(id="link2").get_text())  # text of the element with id="link2"
print(soup.find_all("a"))  # all a tags
print(soup.findAll("a"))  # all a tags (findAll is the older alias of find_all)
print([item.string for item in soup.findAll("a")])  # text of every a tag, via a list comprehension
print(soup.find("p", {"class": "story"}))  # the p tag with class="story"
print(soup.find("p", {"class": "story"}).get_text())  # text of the p tag with class="story"
print(soup.find("p", {"class": "story"}).string)  # returns None: this p tag contains other tags, so .string cannot be used here
print()
import re
# Use regular expressions
for tag in soup.find_all(re.compile("^b")):  # find tags whose names start with "b"
    print(tag.name)
# Find all a tags whose href attribute starts with "http://example.com/"
data = soup.findAll("a", href=re.compile(r"^http://example.com/"))
print(data)
data2 = soup.findAll("a", href=re.compile(r"^http://example\.com/"))
print(data2)
# Docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/#id28
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Request the URL and decode the result as utf-8
resp = urlopen("https://en.wikipedia.org/wiki/Main_Page").read().decode("utf-8")
# Parse with BeautifulSoup
soup = BeautifulSoup(resp, "html.parser")
# Get every a tag whose href starts with /wiki/
listUrls = soup.findAll("a", href=re.compile("^/wiki/"))
# Print the name and URL of every entry
for url in listUrls:
    if not re.search(r"\.(jpg|JPG)$", url['href']):  # skip image URLs ending in .jpg or .JPG
        # print(url['href'])  # print the relative url
        # print(url.get_text(), "<--->", url['href'])  # print the name and the relative url
        print(url.get_text(), "<---->", "https://en.wikipedia.org" + url['href'])  # print the name and the full url
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# URL of the page that lists the entries
url = "https://baike.baidu.com/"
# Request the url and decode the result as utf-8
resp = urlopen(url).read().decode("utf-8")
# Parse with BeautifulSoup
soup = BeautifulSoup(resp, "html.parser")
# Get all divs with class="card_cnt_tit"
list_divs = soup.findAll("div", {"class": "card_cnt_tit"})
# Based on the page structure, first find the div wrapping each entry's a tag
for div in list_divs:
    # Then use a regular expression to pick out the a tag inside the div
    a = div.find("a", href=re.compile(r"^https://"))
    # Print the entry's name and link
    print(a.string, "<-------->", a['href'])
pip install pymysql
# 1. Import the package
import pymysql.cursors
# 2. Get a database connection
connection = pymysql.connect(
    host="localhost",
    user="root",
    password="123456",
    db="baikeurl",
    charset="utf8mb4")
# 3. Get a cursor
cursor = connection.cursor()
# 4. Execute a SQL statement
cursor.execute(sql, (arg1, arg2, ..., argn))
# 5. Commit
connection.commit()
# 6. Close the connection
connection.close()
Full code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pymysql.cursors
# URL of the page that lists the entries
url = "https://baike.baidu.com/"
# Request the url and decode the result as utf-8
resp = urlopen(url).read().decode("utf-8")
# Parse with BeautifulSoup
soup = BeautifulSoup(resp, "html.parser")
# Get all divs with class="card_cnt_tit"
list_divs = soup.findAll("div", {"class": "card_cnt_tit"})
# Get a database connection
connection = pymysql.connect(host="localhost",
                             user="root",
                             password="123456",
                             database="baikeurl",
                             charset="utf8mb4")
try:
    # Get a cursor
    with connection.cursor() as cursor:
        # Based on the page structure, first find the div wrapping each entry's a tag
        for div in list_divs:
            # Then use a regular expression to pick out the a tag inside the div
            a = div.find("a", href=re.compile(r"^https://"))
            # Print the entry's name and link
            print(a.string, "<-------->", a['href'])
            # Insert one row per entry
            sql = "insert into `urls` (`urlname`, `urlhref`) values (%s, %s)"
            cursor.execute(sql, (a.get_text(), a['href']))
        # Commit once after all rows are inserted
        connection.commit()
finally:
    connection.close()
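The code above assumes a database named `baikeurl` containing a table `urls` with `urlname` and `urlhref` columns; the exact schema is not given in the original, so the following is just one possible way to create it:

```python
import pymysql.cursors

# Connect to the baikeurl database (created beforehand, e.g. CREATE DATABASE baikeurl)
connection = pymysql.connect(host="localhost",
                             user="root",
                             password="123456",
                             database="baikeurl",
                             charset="utf8mb4")
try:
    with connection.cursor() as cursor:
        # One possible schema for the urls table used above
        cursor.execute("""
            create table if not exists `urls` (
                `id` int not null auto_increment primary key,
                `urlname` varchar(255) not null,
                `urlhref` varchar(1000) not null
            ) character set utf8mb4
        """)
    connection.commit()
finally:
    connection.close()
```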
# 1. Import the package
import pymysql.cursors
# 2. Get a database connection
connection = pymysql.connect(
    host="localhost",
    user="root",
    password="123456",
    db="baikeurl",
    charset="utf8mb4")
# 3. Get a cursor
cursor = connection.cursor()
# 4.1 Execute a query; the return value is the number of matching rows
cursor.execute(sql)
# 4.2 Fetch the next row
cursor.fetchone()
# 4.3 Fetch a given number of rows
cursor.fetchmany(size=None)
# 4.4 Fetch all rows
cursor.fetchall()
# 5. Close the connection
connection.close()
# Import the package
import pymysql.cursors
# Get a database connection
connection = pymysql.connect(host="localhost",
                             user="root",
                             password="123456",
                             database="baikeurl",
                             charset="utf8mb4")
try:
    # Get a cursor
    with connection.cursor() as cursor:
        # Query statement
        sql = "select urlname, urlhref from urls where id is not null"
        # Execute the query; the return value is the number of matching rows
        count = cursor.execute(sql)
        print(count)  # 9
        # Fetch the data
        result = cursor.fetchmany(size=3)  # first three rows
        # result = cursor.fetchall()  # all rows
        print(result)
finally:
    connection.close()
from urllib.request import urlopen
# Baidu's robots protocol: https://www.baidu.com/robots.txt
url = "https://www.baidu.com/robots.txt"
html = urlopen(url)
print(html.read().decode('utf-8'))
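Beyond printing the file, the standard library's `urllib.robotparser` can check programmatically whether a given URL may be crawled. A minimal sketch:

```python
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("https://www.baidu.com/robots.txt")
rp.read()  # fetch and parse robots.txt

# Check whether a user agent is allowed to fetch a URL
print(rp.can_fetch("Baiduspider", "https://www.baidu.com/baidu"))
print(rp.can_fetch("*", "https://www.baidu.com/baidu"))
```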
Download and install the pdfminer3k module:
pip install pdfminer3k
Alternatively, download the package, unpack it, enter the package directory (the one containing setup.py), and install it directly with:
python setup.py install
python  # verify the installation in the interpreter
import pdfminer
# Import the required packages:
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Get the document object:
fp = open("Automatic Detection.pdf", "rb")  # open in binary read-only mode
# Create a parser tied to the document
parser = PDFParser(fp)
# The PDF document object
doc = PDFDocument()
# Connect the parser and the document object
parser.set_document(doc)
doc.set_parser(parser)
# Initialize the document
doc.initialize("")  # the document has no password, so pass an empty string
# Create a PDF resource manager
resource = PDFResourceManager()
# Create a layout parameter object
laparam = LAParams()
# Create an aggregator
device = PDFPageAggregator(resource, laparams=laparam)
# Create a PDF page interpreter
interpreter = PDFPageInterpreter(resource, device)
# Iterate over the document's pages
for page in doc.get_pages():
    # Read the page with the page interpreter
    interpreter.process_page(page)
    # Get the content from the aggregator
    layout = device.get_result()
    # Loop over every item in the layout
    for out in layout:
        # Avoid AttributeError: 'LTFigure' object has no attribute 'get_text'
        if hasattr(out, "get_text"):
            print(out.get_text())
Output:
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from urllib.request import urlopen
# Get the document object
# Sample pdf: https://www.tipdm.org/u/cms/www/202107/28162910tww9.pdf
# fp = open("Automatic Detection.pdf", "rb")  # open a local file in binary read-only mode
fp = urlopen("https://www.tipdm.org/u/cms/www/202107/28162910tww9.pdf")  # urlopen returns a file-like object that PDFParser can read
# Create a parser tied to the document
parser = PDFParser(fp)
# The PDF document object
doc = PDFDocument()
# Connect the parser and the document object
parser.set_document(doc)
doc.set_parser(parser)
# Initialize the document
doc.initialize("")  # the document has no password, so pass an empty string
# Create a PDF resource manager
resource = PDFResourceManager()
# Create a layout parameter object
laparam = LAParams()
# Create an aggregator
device = PDFPageAggregator(resource, laparams=laparam)
# Create a PDF page interpreter
interpreter = PDFPageInterpreter(resource, device)
# Iterate over the document's pages
for page in doc.get_pages():
    # Read the page with the page interpreter
    interpreter.process_page(page)
    # Get the content from the aggregator
    layout = device.get_result()
    # Loop over every item in the layout
    for out in layout:
        # Avoid AttributeError: 'LTFigure' object has no attribute 'get_text'
        if hasattr(out, "get_text"):
            print(out.get_text())
These notes are based on the course: https://www.imooc.com/video/12622
Code resources: https://download.csdn.net/download/ungoing/86790114