import requests
import re
def _parse_poems(text):
    """Extract poem records from the HTML text of one listing page.

    Runs four regular expressions over ``text`` (titles, dynasties, authors
    and content fragments), removes anything shaped like an HTML tag from
    each content fragment, and zips the four result lists together.  Because
    ``zip`` stops at the shortest list, a page on which one pattern matches
    fewer times than the others yields correspondingly fewer records.

    Args:
        text: Decoded body of the downloaded page.

    Returns:
        A list of dicts keyed by '诗名', '朝代', '作者' and '内容'.
    """
    # NOTE(review): none of the four patterns below contains a literal HTML
    # anchor -- every piece is a lazy wildcard, so each pattern can match the
    # empty string.  They look as if their tag literals were lost at some
    # point; confirm against the page markup before relying on the output.

    # Poem titles.
    titles = re.findall(r'.*?(.*?)', text, re.DOTALL)
    # Dynasties (no DOTALL here, so '.' does not cross line breaks).
    dynasties = re.findall(r'.*?(.*?)', text)
    # Authors.
    authors = re.findall(r'.*?.*?(.*?)', text, re.DOTALL)
    # Raw content fragments, which may still contain inline markup.
    content_tags = re.findall(r'(.*?)', text, re.DOTALL)

    # Drop tag-shaped substrings and surrounding whitespace from each fragment.
    contents = [
        re.sub(r'<.*?>', "", fragment).strip() for fragment in content_tags
    ]

    return [
        {
            '诗名': title,
            '朝代': dynasty,
            '作者': author,
            '内容': content,
        }
        for title, dynasty, author, content
        in zip(titles, dynasties, authors, contents)
    ]


def pare_page(url):
    """Fetch one listing page, print each poem found and append it to a file.

    Every poem dict is printed, followed by a line of 40 dashes, and its
    ``str()`` form is appended to 'gushi.txt' in the current working
    directory.

    Args:
        url: Address of the page to download.

    Returns:
        None.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
    }
    # Pass the headers by keyword: the second positional parameter of
    # requests.get is `params`, which would place these values in the query
    # string instead of sending them as HTTP headers.
    response = requests.get(url, headers=headers)
    poems = _parse_poems(response.text)

    # Open the output once per page rather than once per poem, and pin the
    # encoding so writing Chinese text does not depend on the platform default.
    with open('gushi.txt', 'a', encoding='utf-8') as f:
        for poem in poems:
            print(poem)
            print("-" * 40)
            f.write(str(poem))
def main():
    """Scrape listing pages 1 through 3 by calling pare_page on each URL."""
    for page in range(1, 4):
        # The page number is substituted into the default_<n>.aspx path.
        pare_page('https://www.gushiwen.org/default_%s.aspx' % page)
# Start scraping only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()