from bs4 import BeautifulSoup
def sechBodyURL(path, encoding='gbk'):
    """Print every link target in an HTML file and return the page's text.

    Args:
        path: Path of the HTML file to parse.
        encoding: Text encoding used to decode the file. Defaults to
            ``'gbk'`` because the file this script was written for is
            GBK-encoded; pass e.g. ``'utf-8'`` for other files.

    Returns:
        The document's text content with leading and trailing whitespace
        stripped. The same text is also printed, after the link targets.
    """
    # errors='ignore' drops bytes that are invalid in the chosen encoding,
    # so a partly mis-encoded page can still be parsed (best effort).
    with open(path, encoding=encoding, errors='ignore') as fp:
        # BeautifulSoup consumes the file object here, so the handle can be
        # closed as soon as parsing is done.
        text = BeautifulSoup(fp, 'html.parser')

    # href=True skips anchors that have no href attribute (for example
    # <a name="top">), which would otherwise raise KeyError on u['href'].
    for u in text.find_all('a', href=True):
        print(u['href'])

    content = text.get_text().strip()
    print(content)
    return content
# Demo run: print the links and the text of the sample page.
sechBodyURL(path='20test.html')
运行结果如下。输出中的中文存在乱码,这是由原 HTML 文件的编码问题导致的。
引用:https://blog.csdn.net/qq_35614920/article/details/76746902