python爬取天气数据

前几天做项目需要用到历史天气数据,于是写了一个爬虫脚本:用 Selenium 驱动无头 Chrome,从 lishi.tianqi.com 按城市、按月份抓取每天的日期、最高/最低气温、天气和风向,并为每个城市保存一个 CSV 文件。希望可以帮到大家。
python爬取天气数据_第1张图片
python爬取天气数据_第2张图片
代码如下

import pandas as pd
from selenium import webdriver
import calendar

# User settings: the cities to scrape (as they appear in the site's URLs),
# the first month and the last day of the date window, and the output folder.
place = [
    'shenzhen',
    'shanghai',
    'beijing',
    'guangzhou',
    'tianjin',
    'wuhan',
    'chongqing',
    'xiamen',
]
start_year, start_month = 2013, 1
end_year, end_month, end_day = 2021, 6, 20
# Folder prefix; each city's CSV is written as path + <city> + '.csv'.
path = 'C:/Users/86189/Desktop/论文数据/rawdata/地区气候/'
def _month_sequence(start_year, start_month, end_year, end_month):
    """Return every (year, month) pair from the start month to the end month.

    Both endpoints are inclusive. Months are counted on a single linear
    axis (year * 12 + month), so a window that starts and ends in the same
    year yields only the months in between instead of wrapping through
    December. A start later than the end yields an empty list.
    """
    first = start_year * 12 + (start_month - 1)
    last = end_year * 12 + (end_month - 1)
    return [(index // 12, index % 12 + 1) for index in range(first, last + 1)]


def _clean_cell(text):
    """Normalise one table cell: drop the degree sign and a weekday suffix."""
    text = text.replace('℃', '')
    # Date cells end with a three-character weekday ('星期' plus one more
    # character); cut it off together with the separator left in front of it.
    if text[-3:-1] == '星期':
        text = text[:-3].rstrip()
    return text


def crawl(place,start_year,start_month,end_year,end_month,end_day):
    """Scrape daily weather rows for one city and save them as a CSV file.

    One page per month is loaded from lishi.tianqi.com with headless Chrome,
    from (start_year, start_month) through (end_year, end_month). In the
    final month only days 1..end_day are read.

    Parameters:
        place: city slug used in the site URL and in the output file name.
        start_year, start_month: first month to fetch.
        end_year, end_month: last month to fetch.
        end_day: last day of the final month to read.

    Side effects:
        Writes ``path + place + '.csv'`` (``path`` is the module-level output
        folder), GBK-encoded, with the columns date, high temperature, low
        temperature, weather and wind direction.

    Raises:
        Selenium errors from loading a page or from the "expand list" button
        propagate to the caller. A missing or unreadable day row is skipped.
    """
    # Imported here so the block carries its own dependency.
    from selenium.common.exceptions import WebDriverException

    out_path = path + place + '.csv'
    months = _month_sequence(start_year, start_month, end_year, end_month)
    list_xpath = '/html/body/div[7]/div[1]/div[4]/ul'
    rows = []

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome = webdriver.Chrome(options=chrome_options)
    try:
        for position, (year, month) in enumerate(months):
            chrome.get('https://lishi.tianqi.com/{}/{}{:02d}.html'.format(place, year, month))

            day_num = calendar.monthrange(year, month)[1]
            if position == len(months) - 1:
                # The last month is only read up to end_day.
                day_num = min(day_num, end_day)

            # Click the element at the end of the list before reading rows
            # (presumably the "show more" toggle that reveals the full month).
            # The ('xpath', value) locator form is used instead of
            # find_element_by_xpath so the call does not depend on the
            # legacy helper methods.
            chrome.find_element('xpath', list_xpath + '/div').click()

            for day in range(1, day_num + 1):
                # Read all five cells first and append the row only when it
                # is complete, so every output column keeps the same length.
                try:
                    cells = [
                        chrome.find_element(
                            'xpath',
                            '{}/li[{}]/div[{}]'.format(list_xpath, day, column),
                        ).text
                        for column in range(1, 6)
                    ]
                except WebDriverException:
                    # Best effort: days the page does not list are skipped.
                    continue
                rows.append([_clean_cell(cell) for cell in cells])
    finally:
        # Always release the browser, even when a page fails to load.
        chrome.quit()

    # Column names are kept exactly as before so existing consumers of the
    # CSV keep working.
    out_table = pd.DataFrame(
        rows,
        columns=[
            'date',
            'hight_temprature(℃)',
            'low_temprature(℃)',
            'weather',
            'The direction of the wind',
        ],
    )
    out_table.to_csv(out_path, encoding='gbk')
# Scrape every configured city in turn; each call writes that city's CSV.
for city in place:
    crawl(
        place=city,
        start_year=start_year,
        start_month=start_month,
        end_year=end_year,
        end_month=end_month,
        end_day=end_day,
    )




你可能感兴趣的:(python,开发语言,后端,爬虫)