爬虫练手小项目:豆瓣高分图书TOP100

爬虫练手小项目:豆瓣高分图书TOP100

import requests
import re
from requests.exceptions import RequestException
import json
import time

def get_one_page(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request itself
        fails (connection error, timeout, ...).
    """
    # Send a desktop-browser User-Agent instead of the library default.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh;Intel Mac OS X 10_13_3) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/65.0.3325.162 Safari/537.36'
        ),
    }
    try:
        # requests applies no timeout by default, so a stalled server
        # would block the script forever without one.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


# Three capture groups per list entry: cover image URL, title text, score.
# NOTE(review): the tag anchors (<img src>, </a>, </span>) were reconstructed
# because the published pattern had only two groups and no closing
# delimiters; confirm them against the live page markup.
_ITEM_PATTERN = re.compile(
    r'<img[^>]*?src="(.*?)".*?title.*?_blank">(.*?)</a>'
    r'.*?rating_nums">(.*?)</span>',
    re.S,
)


def parse_one_page(html, offset):
    """Yield one dict per book entry found in *html*.

    Args:
        html: Page source as a string. ``None`` or an empty string
            yields nothing instead of raising.
        offset: Number of entries preceding this page; the first entry
            on the page gets index ``offset + 1``.

    Yields:
        Dicts with the keys ``index`` (int), ``image`` (cover URL),
        ``title`` (surrounding whitespace stripped) and ``score`` (the
        rating text as it appears in the page).
    """
    if not html:
        return
    matches = _ITEM_PATTERN.findall(html)
    for index, (image, title, score) in enumerate(matches, start=offset + 1):
        yield {
            'index': index,
            'image': image,
            'title': title.strip(),
            'score': score,
        }

def write_to_file(content):
    """Append *content* to ``doubandushu.txt`` as a single JSON line.

    Args:
        content: A JSON-serialisable object. Non-ASCII characters are
            written as-is (UTF-8) rather than as ``\\uXXXX`` escapes.
    """
    with open("doubandushu.txt", mode='a', encoding='utf-8') as out:
        serialized = json.dumps(content, ensure_ascii=False)
        out.write(f"{serialized}\n")

def main(offset):
    """Fetch one page of the list, then print and save every entry on it.

    Args:
        offset: Value of the ``start`` query parameter, i.e. how many
            entries to skip before this page.
    """
    url = f"https://www.douban.com/doulist/45004834/?start={offset}&sort=time&sub_type="
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None when the page could not be fetched;
        # skip the page instead of handing None to the parser.
        print(f"Failed to fetch {url}; skipping this page.")
        return
    for item in parse_one_page(html, offset):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    # Four pages of 25 entries each: offsets 0, 25, 50 and 75.
    for page_start in range(0, 100, 25):
        main(offset=page_start)
        # Wait one second after each page before continuing.
        time.sleep(1)
* 注:1. 本项目模仿崔庆才著的《Python3网络爬虫开发实战》3.4节"抓取猫眼电影排行"的示例;2. 博主能力有限,暂时无法写出合适的正则表达式来抓取书籍作者信息,在此向各位网友请教。


你可能感兴趣的:(Python,WebParser,WebParser,Python3,Cui,Qingcai)