# -*- coding:utf-8 -*-
import csv
import io
import sys
# Re-wrap stdout so Chinese text prints correctly on a GBK Windows console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
import requests
from bs4 import BeautifulSoup  # import the required libraries
# Request headers: send a browser User-Agent so the site serves the crawler
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 Firefox/58.0'}
def get_top(url):  # scrape one listing page
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = "gb18030"
    soup = BeautifulSoup(response.text, 'lxml')
    # attraction names
    names = soup.find_all('a', class_='blue14b')
    # descriptions
    papers = soup.find_all(id='tcjs')
    # image links
    links = soup.select('tr td a img')
    data = []
    for name, paper, link in zip(names, papers, links):
        data.append([  # extract the text of each node
            # attraction name
            name.get_text(),
            # description
            paper.get_text().strip(),
            # image link
            link.get('src'),
        ])
    return data
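
# Optional hardening sketch (not in the original flow): a thin retry wrapper
# around get_top for flaky connections. The name fetch_page and the
# retries/delay values are assumptions, not anything bytravel.cn documents.
import time

def fetch_page(url, retries=3, delay=2):
    """Call get_top, waiting and retrying on network errors."""
    for _ in range(retries):
        try:
            return get_top(url)
        except requests.RequestException:
            time.sleep(delay)  # back off briefly before the next attempt
    return []  # give up: return no rows instead of crashing the whole run
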
# write data rows
def write_1(data, y):
    # newline='' stops the csv module from inserting blank rows on Windows
    with open(y, "a+", encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # use writerows to write several rows at once
        writer.writerows(data)
# write the header row
def write_2(y):
    with open(y, "a+", encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # write the column names first
        writer.writerow(["景点", "简介", "图片链接"])  # attraction, description, image link
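
# Optional sketch (an assumption about desired behavior, not in the original
# script): the files are opened in append mode, so re-running the script adds
# a duplicate header row. Guarding on file existence avoids that.
import os

def write_header_once(y):
    """Write the header row only when the CSV does not exist yet."""
    if not os.path.exists(y):
        write_2(y)
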
# crawl every listing page for one district
def get_paper(x, y):
    write_2(y)
    # the first listing page has no page number in its URL
    write_1(get_top("http://www.bytravel.cn/view/index{}_list.html".format(x)), y)
    # the remaining pages are numbered index{x}_list{i}.html
    for i in range(11):
        url = 'http://www.bytravel.cn/view/index{}_list{}.html'.format(x, i)
        write_1(get_top(url), y)
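
# A gentler variant (sketch; the one-second pause is an assumption about what
# the site tolerates, not a documented rate limit). Uses the time module
# imported above with the retry sketch.
def get_paper_polite(x, y, pause=1):
    write_2(y)
    write_1(get_top("http://www.bytravel.cn/view/index{}_list.html".format(x)), y)
    for i in range(11):
        time.sleep(pause)  # pause between consecutive page fetches
        write_1(get_top('http://www.bytravel.cn/view/index{}_list{}.html'.format(x, i)), y)
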
if __name__ == '__main__':
    # bytravel.cn area IDs and the matching district names (used as CSV file names);
    # renamed from `list`/`name` so they no longer shadow Python built-ins
    ids = ['320', '1283', '2305', '1283', '2304', '1232', '1056', '2303', '2300', '2302']
    names = ['湛江', '雷州市', '徐闻县', '霞山区', '遂溪县', '廉江市', '吴川市', '麻章区', '赤坎区', '坡头区']
    for area_id, district in zip(ids, names):
        get_paper(area_id, district + '.csv')