Python简单使用正则表达式爬取古诗

import requests
import re

# NOTE(review): the HTML tags inside the four extraction patterns below were
# stripped when this listing was copied out of a web page, leaving patterns
# such as r'.*?(.*?)' that cannot match anything useful.  They have been
# reconstructed from the surviving regex skeleton; confirm them against the
# live page markup before relying on the results.
_TITLE_RE = re.compile(r'<div class="cont">.*?<b>(.*?)</b>', re.DOTALL)
# No DOTALL here, as in the original listing: the first link has to sit on
# the same line as the opening <p class="source"> tag.
_DYNASTY_RE = re.compile(r'<p class="source">.*?<a.*?>(.*?)</a>')
_AUTHOR_RE = re.compile(
    r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', re.DOTALL
)
_CONTENT_RE = re.compile(r'<div class="contson".*?>(.*?)</div>', re.DOTALL)
# Matches any single HTML tag so it can be removed from a poem body.
_TAG_RE = re.compile(r'<.*?>')


def extract_poems(text):
    """Extract poem records from the HTML text of one listing page.

    Args:
        text: HTML source of the page as a string.

    Returns:
        A list of dicts with the keys '诗名' (title), '朝代' (dynasty),
        '作者' (author) and '内容' (content), in page order.  The four
        match lists are combined with zip(), so the result is as long as
        the shortest of them.
    """
    titles = _TITLE_RE.findall(text)
    dynasties = _DYNASTY_RE.findall(text)
    authors = _AUTHOR_RE.findall(text)
    # Drop inline markup (e.g. <br />) and surrounding whitespace from each body.
    contents = [
        _TAG_RE.sub("", raw).strip() for raw in _CONTENT_RE.findall(text)
    ]
    return [
        {
            '诗名': title,
            '朝代': dynasty,
            '作者': author,
            '内容': content,
        }
        for title, dynasty, author, content in zip(
            titles, dynasties, authors, contents
        )
    ]


def pare_page(url):
    """Download one listing page, print its poems and append them to gushi.txt.

    Args:
        url: Address of the page to fetch.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get() is `params`, which would put the dict in the query string.
    response = requests.get(url, headers=headers)
    poems = extract_poems(response.text)

    # Open the output once per page; explicit UTF-8 so the Chinese text is
    # written the same way on every platform.
    with open('gushi.txt', 'a', encoding='utf-8') as f:
        for poem in poems:
            print(poem)
            print("-" * 40)
            # One record per line so appended runs stay separable.
            f.write(str(poem) + '\n')


def main():
    """Scrape listing pages 1 through 3."""
    for page in range(1, 4):
        pare_page('https://www.gushiwen.org/default_%s.aspx' % page)


if __name__ == '__main__':
    main()

 

你可能感兴趣的:(Python简单使用正则表达式爬取古诗)