Web scraping mini-examples

1) Scraping Douban movies

import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import os
import time

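# A browser-like User-Agent; sites such as Douban often reject the default python-requests UA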
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
}


# Collect the detail-page links for 100 movies (five pages of 20)
def getMovieUrl(start):
    movie_url_list = []
    for page_start in range(start, start + 100, 20):
        url = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%8D%8E%E8%AF%AD&sort=recommend&page_limit=20&page_start=" + \
            str(page_start)
        response = requests.get(url, headers=headers)
        data = json.loads(response.text)  # the endpoint returns JSON
        for movie in data["subjects"]:
            movie_url_list.append(movie['url'])
    return movie_url_list


# Scrape the details from each movie page
def catchMovieInformation(urls):
    movie_name_list = []
    director_list = []
    actor_list = []
    year_list = []
    score_list = []
    evaluation_list = []
    for movie_url in urls:
        response = requests.get(movie_url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        # Title
        movie_name = soup.select("#content > h1 > span:nth-child(1)")[0]
        movie_name = movie_name.get_text().replace("\n", "").replace(" ", "")
        movie_name_list.append(movie_name)
        # Director
        director = soup.select(
            "#info > span:nth-child(1) > span.attrs > a")[0]
        director = director.get_text().replace("\n", "").replace(" ", "")
        director_list.append(director)
        # Leading actors (top three)
        actor = soup.select("#info > span.actor > span.attrs")[0]
        actor = actor.get_text().replace("\n", "").replace(" ", "")
        actors = actor.split("/")
        if len(actors) >= 3:
            actor_list.append({
                "lead1": actors[0],
                "lead2": actors[1],
                "lead3": actors[2],
            })
        elif len(actors) == 2:
            actor_list.append({
                "lead1": actors[0],
                "lead2": actors[1],
            })
        else:
            actor_list.append({
                "lead1": actors[0],
            })
        # Release year
        year = soup.select("#content > h1 > span.year")[0]
        year = year.get_text().replace("\n", "").replace(
            " ", "").replace("(", "").replace(")", "")
        year_list.append(year)
        # Rating
        score = soup.select(
            "#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > strong")[0]
        score = score.get_text().replace("\n", "").replace(" ", "")
        score_list.append(score)
        # Number of ratings
        evaluation = soup.select(
            "#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > div > div.rating_sum > a > span")[0]
        evaluation = evaluation.get_text().replace("\n", "").replace(" ", "")
        evaluation_list.append(evaluation)
    # Return the full list here (the original returned only the last value)
    return movie_name_list, director_list, actor_list, year_list, score_list, evaluation_list


# Append one batch of results to the CSV file
def writer(urls, file_path='./movies.csv'):
    res = catchMovieInformation(urls)
    data = {'title': res[0],
            'director': res[1],
            'leads (top 3)': res[2],
            'year': res[3],
            'rating': res[4],
            'votes': res[5]
            }
    movies = pd.DataFrame(data)
    # Flatten each actor dict into a single "a/b/c" string
    movies.iloc[:, 2] = movies.iloc[:, 2].map(lambda x: '/'.join(x.values()))
    # Write the header only when the file does not exist yet
    movies.to_csv(file_path, mode='a',
                  header=not os.path.exists(file_path), index=False)


# Crawl in batches of 100 links until the endpoint returns nothing
start = 0
urls = getMovieUrl(start)

while len(urls):
    writer(urls)
    start += 100
    time.sleep(3)  # be polite: pause between batches
    urls = getMovieUrl(start)
    print(start)

movies = pd.read_csv('./movies.csv')
movies
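The selectors above are tied to Douban's current page markup; any of them can come back empty (for example, a film with no rating yet), and the bare [0] indexing then raises IndexError and kills the whole batch. A minimal hardening sketch, assuming the same requests/bs4 stack (the helper names fetch and safe_select_text are illustrative, not part of the script above):

import time
import requests

def fetch(url, headers, retries=3, delay=2):
    # Hypothetical helper: retry transient failures with a pause,
    # and only raise after the last attempt.
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

def safe_select_text(soup, selector, default=""):
    # Hypothetical helper: return the stripped text of the first match,
    # or a default instead of raising IndexError on a missing node.
    nodes = soup.select(selector)
    return nodes[0].get_text().strip() if nodes else default

With these, a lookup such as soup.select("#content > h1 > span.year")[0] becomes safe_select_text(soup, "#content > h1 > span.year"), so a page with a missing field yields an empty cell instead of aborting the run.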

2) Scraping the novel 《赝太子》 from Biquge

'''
@Description: scrape the novel 《赝太子》 from Biquge
@version: 1.0.0
@Author: blsm
@Date: 2020-05-28 09:42:46
'''

from bs4 import BeautifulSoup
import requests
import sys


class Downloader():
    def __init__(self):
        self.server = 'https://www.biqugexx.com'
        self.target = 'https://www.biqugexx.com/118_118150/'  # table-of-contents page
        self.namelist = []  # chapter titles
        self.urls = []  # chapter links
        self.nums = 0  # number of chapters

    def get_download_url(self):
        """
        Collect the chapter links from the table of contents.
        """
        req = requests.get(self.target)
        html = req.text
        # Parse the HTML (name the parser explicitly to avoid the bs4 warning)
        bf = BeautifulSoup(html, 'html.parser')
        list_bf = bf.find_all('div', id='list')
        a_bf = BeautifulSoup(str(list_bf[0]), 'html.parser')
        a = a_bf.find_all('a')
        # Discard the first 12 links, which are not wanted chapters
        self.nums = len(a[12:])
        for each in a[12:]:
            self.namelist.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_contents(self, target):
        """
        Fetch the text of one chapter.
        Arguments:
            target {str} -- chapter URL
        """
        req = requests.get(target)
        html = req.text
        bf = BeautifulSoup(html, 'html.parser')
        content = bf.find_all('div', id='content')
        # Runs of four non-breaking spaces mark paragraphs; turn them into blank lines
        content = content[0].text.replace('\xa0' * 4, '\n\n')
        return content

    def writer(self, name, path, content):
        """
        Append one scraped chapter to the output file.
        Arguments:
            name {str} -- chapter title
            path {str} -- output file path
            content {str} -- chapter text
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(content)
            f.write('\n\n')


if __name__ == '__main__':
    down = Downloader()
    down.get_download_url()
    path = './赝太子(荆柯守).txt'
    print("Starting download of 《赝太子》")

    for i in range(down.nums):
        down.writer(down.namelist[i], path, down.get_contents(down.urls[i]))
        if i % 10 == 0:
            sys.stdout.write("Downloaded: %.3f%%" % (i / down.nums * 100) + '\r')
            sys.stdout.flush()
    print("《赝太子》 download complete")
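The loop above fires one bare request per chapter with no delay, timeout, or connection reuse. A small optional refinement, sketched under the assumption that slow sequential crawling is acceptable to the site (session and get_contents_politely are additions, not part of the Downloader class):

import time
import requests
from bs4 import BeautifulSoup

session = requests.Session()  # reuse one TCP connection across chapters

def get_contents_politely(target, delay=1.0):
    # Same parsing as Downloader.get_contents, but through a shared
    # session, with a timeout and a short pause between requests.
    html = session.get(target, timeout=10).text
    bf = BeautifulSoup(html, 'html.parser')
    content = bf.find_all('div', id='content')[0].text.replace('\xa0' * 4, '\n\n')
    time.sleep(delay)
    return content

In the main loop, down.get_contents(down.urls[i]) would then be replaced by get_contents_politely(down.urls[i]).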
