# Extract a web page with bs4, parsing it with find_all first
import requests
from bs4 import BeautifulSoup
DATA = []  # will hold the scraped records (see the storage sketch at the end)
def getHTMLtext(url, headers, timeout=10):
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()  # raise_for_status is a method and must be called
        resp.encoding = 'utf-8'
        return resp.text
    except requests.RequestException:
        return ''
def bs4_find_all_Parser(text):
    soup = BeautifulSoup(text, 'lxml')
    # find_all returns a list whose elements are all Tag objects.
    # The page contains some extra "sons" divs besides the poems,
    # so keep only the first 10 entries, which are the ones we need.
    sons = soup.find_all('div', class_="sons")[:10]
    for son in sons:
        name = son.find('b').string  # poem title
        print(name)
        dynasty_author = son.find('p', class_="source").get_text()  # dynasty and author
        print(dynasty_author)
        content = son.find('div', class_="contson").get_text().strip()  # poem body
        print(content)
        like = son.find_all('span')[1].string.strip()  # second <span> holds the like count
        print('Likes: ' + like)
        print('\n' + '*' * 30 + '\n')
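# For comparison (my own aside, not from the original post): the same ten
# nodes can be fetched with a CSS selector instead of find_all:
#     sons = soup.select('div.sons')[:10]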
if __name__ == '__main__':
    url = 'https://www.gushiwen.org/default_1.aspx'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
    text = getHTMLtext(url, headers)
    if text == '':
        print('url: {} request failed'.format(url))
    else:
        bs4_find_all_Parser(text)
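The DATA list declared at the top of the script is never actually filled. As a minimal sketch of one way to keep the results instead of only printing them, the helper below collects the same four fields into DATA and writes them to a JSON file; the dictionary keys and the poems.json filename are assumptions of mine, not part of the original post.

import json  # assumed addition for the storage sketch

def save_records(sons):
    # Hypothetical helper: gather the same fields the parser prints,
    # then dump them to disk. It could be called from bs4_find_all_Parser
    # once the sons list has been built.
    for son in sons:
        DATA.append({
            'title': son.find('b').string,
            'source': son.find('p', class_="source").get_text(),
            'content': son.find('div', class_="contson").get_text().strip(),
            'likes': son.find_all('span')[1].string.strip(),
        })
    # ensure_ascii=False keeps the Chinese poem text readable in the file
    with open('poems.json', 'w', encoding='utf-8') as f:
        json.dump(DATA, f, ensure_ascii=False, indent=2)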