# 爬取旅游景点 (scrape tourist attractions)

# -*- coding:utf-8 -*-
import csv
import io
import sys
# Re-wrap stdout as gb18030 so Chinese text prints correctly on a GBK/Chinese
# Windows console; NOTE(review): this will garble output on UTF-8 terminals.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
import requests
from bs4 import BeautifulSoup  # HTTP fetching and HTML parsing

# Request headers: desktop Firefox User-Agent so the site serves the normal page
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 Firefox/58.0'}


def get_top(url):
    """Scrape one listing page and return its attraction rows.

    Args:
        url: URL of a bytravel.cn listing page.

    Returns:
        A list of [name, introduction, image-url] rows; empty when the
        page contains no entries (e.g. past the last pagination page).
    """
    # timeout prevents the crawl from hanging forever on a dead connection
    response = requests.get(url, headers=headers, timeout=10)
    # The site serves Chinese text; force gb18030 so response.text decodes it.
    response.encoding = "gb18030"
    soup = BeautifulSoup(response.text, 'lxml')

    # Attraction names
    names = soup.find_all('a', class_='blue14b')
    # Introductions
    papers = soup.find_all(id='tcjs')
    # Image links
    links = soup.select('tr td a img')

    # zip truncates to the shortest list, keeping the three columns aligned
    # row by row even if one selector matched extra elements.
    return [
        [name.get_text(), paper.get_text().strip(), link.get('src')]
        for name, paper, link in zip(names, papers, links)
    ]

#写入内容函数
# Append data rows to a CSV file.
def write_1(data, y):
    """Append rows of scraped data to the CSV file *y*.

    Args:
        data: list of rows, each row a list of column values.
        y: path of the CSV file to append to.
    """
    # newline='' is required by the csv module; without it each row is
    # followed by a blank line on Windows (\r\r\n line endings).
    with open(y, "a+", encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # writerows writes all rows in one call
        writer.writerows(data)

#写入列名函数
# Write the CSV header row.
def write_2(y):
    """Append the column-name header row to the CSV file *y*.

    Args:
        y: path of the CSV file to write the header into.
    """
    # newline='' is required by the csv module; without it the header row is
    # followed by a blank line on Windows (\r\r\n line endings).
    with open(y, "a+", encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Column names go in first
        writer.writerow(["景点", "简介", "图片链接"])

# 多页爬取
# Crawl every listing page for one area into its CSV file.
def get_paper(x, y):
    """Write the header then scrape all listing pages for area code *x* into *y*.

    Args:
        x: bytravel.cn area code (string).
        y: output CSV file path.
    """
    write_2(y)
    # The first page has no page-number suffix.
    write_1(get_top("http://www.bytravel.cn/view/index{}_list.html".format(x)), y)
    # Numbered continuation pages 0..10.
    for page in range(11):
        page_url = 'http://www.bytravel.cn/view/index{}_list{}.html'.format(x, page)
        write_1(get_top(page_url), y)

if __name__ == '__main__':
    # Area codes paired index-by-index with the place names used as file names.
    # NOTE(review): '1283' appears twice (雷州市 and 霞山区) — one code looks
    # wrong; verify against the site.
    codes = ['320','1283','2305','1283','2304','1232','1056','2303','2300','2302']
    places = [ '湛江','雷州市','徐闻县', '霞山区', '遂溪县' ,'廉江市' ,'吴川市', '麻章区' ,'赤坎区' ,'坡头区']
    for code, place in zip(codes, places):
        get_paper(code, place + '.csv')


# 你可能感兴趣的:(python)