爬虫案例:爬取笔趣阁小说

捣鼓了两天啊,,,,各种问题我要吐了,,,,,,果然我太菜了(哭)

import requests
import re
import os
# Request headers copied verbatim from a real browser session.  The target
# site (www.biqooge.com) rejects sparse/bot-looking requests, and it is not
# known which header(s) the server actually checks — so the author copied
# everything, including the session Cookie and User-Agent.
# NOTE(review): the Cookie and the fikker session values are tied to one
# browsing session and will expire; refresh them from a live browser if
# requests start failing.
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'fikker-spEH-SVuL=SKuZGHo8IKJMQDideJaIQVcfyHkFLb9H; fikker-spEH-SVuL=SKuZGHo8IKJMQDideJaIQVcfyHkFLb9H; Hm_lvt_e73fd8f9f7e092a67d6a312a933f5525=1586390925,1586398042,1586418821,1586431465; Hm_lpvt_e73fd8f9f7e092a67d6a312a933f5525=1586431465; jieqiVisitId=article_articleviews%3D1',
'Host': 'www.biqooge.com',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

def get_urls(url):
    """Fetch the novel's table-of-contents page and return (href, title) pairs.

    url: the novel's index-page URL on the biquge mirror.

    NOTE(review): the original regex literal was mangled by the blog export —
    the HTML tags around the two ``(.+?)`` capture groups were stripped.  The
    pattern below is the conventional biquge chapter-list markup; confirm it
    against the live page before relying on it.
    """
    resp = requests.get(url, headers=headers)
    # The site serves GBK-encoded pages (explicit decode in the original).
    text = resp.content.decode('gbk')
    data = re.findall(r'<dd><a href="(.+?)">(.+?)</a>', text, re.DOTALL)
    return data


def get_content(data, url):
    """Download every chapter in *data* and write each to '<title>.txt'.

    data: list of (href, title) tuples as returned by get_urls().
    url:  the novel's index-page URL (chapter hrefs are resolved against it).
    """
    # The original skipped the first 9 entries — biquge-style pages list the
    # "latest chapters" first, duplicating real chapters further down.
    data = data[9:]
    for href, title in data:
        name = title + '.txt'
        # Chapter links are relative; keep only the final path segment and
        # append it to the index URL, exactly as the original did.
        detail_url = href.split('/')[-1]
        resp = requests.get(url + detail_url, headers=headers)
        text = resp.content.decode('gbk')
        # NOTE(review): this regex also lost its HTML tags in the export;
        # <div id="content"> is the usual body container on biquge mirrors —
        # verify against the live page.
        content = re.findall(r'<div id="content">(.+?)</div>', text, re.DOTALL)
        # Strip layout artifacts: &nbsp; entities, literal spaces, and <br>
        # tags (the original's replacement pattern was likewise garbled —
        # TODO confirm the exact alternatives).
        cleaned = re.sub(r'&nbsp;| |<br\s*/?>', '', content[0])
        with open(name, 'w', encoding='utf-8') as f:
            f.write(cleaned)


def main():
    """Prompt for a novel index URL and download all chapters into 小说/."""
    # exist_ok avoids the FileExistsError that the original os.mkdir raised
    # whenever the script was run a second time.
    os.makedirs('小说', exist_ok=True)
    os.chdir('小说')
    url = input('请输入笔趣阁小说网址:')
    chapters = get_urls(url)
    get_content(chapters, url)


if __name__ == '__main__':
    main()

然后我发现了一个真理:如果你感觉写的代码没问题,那一定是你请求头复制得不够多(显得我好垃圾)。
反正不知道服务器检测哪个请求头,多复制一点肯定没问题啦,hhhhh

你可能感兴趣的:(爬虫)