Python 爬虫：从诗词名句网爬取《三国演义》

import os
import re

import requests

# HTTP headers passed to every requests.get() call in this script.
# The User-Agent is a desktop Chrome string; presumably this is to avoid the
# site rejecting the default python-requests agent — TODO confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
# Site root; chapter hrefs taken from the index page are appended to it.
_BASE_URL = 'http://www.shicimingju.com'
# Index page of the book: title, introduction and the chapter list.
_INDEX_URL = 'http://www.shicimingju.com/book/sanguoyanyi.html'
# Output file, built with os.path.join so it also works outside Windows.
_OUTPUT_PATH = os.path.join('text', '三国演义.txt')
# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10

# NOTE(review): the HTML tags inside the original regex literals were lost
# when this script was published (only the "(.*?)" groups survived). The
# patterns below are a best-guess reconstruction of their shape and MUST be
# checked against the live page markup before relying on the output.
_PAT_CHAPTER_TITLE = re.compile(r'<h1>(.*?)</h1>')
_PAT_CHAPTER_TEXT = re.compile(r'<p>(.*?)</p>')
_PAT_BOOK_TITLE = re.compile(r'<h1>(.*?)</h1>')
_PAT_BOOK_INTRO = re.compile(r'<p>(.*?)</p>')
# Must capture the href, because each match is passed to textPaqu() as a URL.
_PAT_CHAPTER_LINK = re.compile(r'<li><a href="(.*?)">.*?</a></li>')


def _fetch(url):
    """Download *url* and return its body decoded as UTF-8.

    Raises requests.HTTPError on a non-2xx response instead of silently
    parsing an error page.
    """
    response = requests.get(url=url, headers=headers, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content.decode('utf-8')


def _open_output():
    """Open the output file for appending, creating its directory if needed."""
    os.makedirs(os.path.dirname(_OUTPUT_PATH), exist_ok=True)
    # Explicit UTF-8 so the result does not depend on the platform default.
    return open(_OUTPUT_PATH, 'a', encoding='utf-8')


# Scrape the content of one chapter.
def textPaqu(url):
    """Fetch one chapter page and append its title and text to the output file.

    Args:
        url: Site-relative path of the chapter page (appended to the site
            root), as extracted from the index page by paqu().

    Raises:
        IndexError: If no chapter title is found in the page.
        requests.RequestException: If the page cannot be downloaded.
    """
    content = _fetch(_BASE_URL + url)

    title = _PAT_CHAPTER_TITLE.findall(content)
    if not title:
        raise IndexError('no chapter title found at %s' % url)
    text = _PAT_CHAPTER_TEXT.findall(content)

    with _open_output() as fr:
        fr.write(title[0] + '\n\n')
        for paragraph in text:
            # NOTE(review): the published source shows replace(' ', ' '), a
            # no-op; presumably it replaced the '&nbsp;' entity before the
            # page renderer decoded it — confirm.
            fr.write(paragraph.strip().replace('&nbsp;', ' ') + '\n')
        fr.write('\n\n')


# Scrape the book introduction and the chapter list.
def paqu():
    """Scrape the book index page, then every chapter it links to.

    Writes the book title and introduction to the output file, then calls
    textPaqu() for each chapter link in page order, printing progress.

    Raises:
        IndexError: If no book title is found in the index page.
        requests.RequestException: If a page cannot be downloaded.
    """
    content = _fetch(_INDEX_URL)

    title = _PAT_BOOK_TITLE.findall(content)
    if not title:
        raise IndexError('no book title found at %s' % _INDEX_URL)
    jianjie = _PAT_BOOK_INTRO.findall(content)
    zhangjie = _PAT_CHAPTER_LINK.findall(content)

    with _open_output() as fr:
        fr.write(title[0] + '\n')
        for line in jianjie:
            fr.write(line + '\n')
        fr.write('\n\n')

    for number, href in enumerate(zhangjie, start=1):
        print('正在爬取第%d章' % number)
        textPaqu(href)
        print('第%d章爬去完成' % number)


if __name__ == '__main__':
    paqu()

    你可能感兴趣的:(python爬虫)