【Python】08 通用水文年鉴数据提取方法

问题描述

现有大约160份水文年鉴数据Excel表格,包含水位、降水量、水文要素摘录表和逐日表,格式如下图所示。
【Python】08 通用水文年鉴数据提取方法_第1张图片
需将其存储到数据库中,如下图所示。
【Python】08 通用水文年鉴数据提取方法_第2张图片

问题分析

如此大量的数据,如果采用手动复制粘贴的方法,需要耗费大量时间。考虑到水文年鉴数据格式统一,可以采用Python对Excel表格数据进行提取,大致分为3步。

  1. 将水文年鉴数据提取到对应的Newstyle Excel表中,数据组织形式如下图所示。
    【Python】08 通用水文年鉴数据提取方法_第3张图片

  2. 将所有的Newstyle Excel表汇总成一个水文数据汇总表,如下图所示。
    【Python】08 通用水文年鉴数据提取方法_第4张图片

  3. 在Access中导入各水文要素记录
    【Python】08 通用水文年鉴数据提取方法_第5张图片

实现方法

逐日降水量

import xlrd
import xlwt
import calendar
import arrow
import os  

# 输出该目录下所有子文件目录  
def get_file_name(file_dir):
    """Collect the path of every file found below ``file_dir``.

    The tree is walked recursively with ``os.walk`` and each file name is
    joined with the directory it was found in.

    :param file_dir: root directory to scan
    :return: list of file paths, in the order ``os.walk`` yields them
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(file_dir)
        for name in names
    ]
        
def isLeapYear(years):
    """Return the number of days in the given year.

    Despite its name the function does not return a boolean: it yields 366
    for a Gregorian leap year and 365 otherwise.

    :param years: the year, int
    :return: 366 or 365
    :raises AssertionError: if ``years`` is not an int
    """
    # Reject non-integer years before doing any arithmetic on them.
    assert isinstance(years, int), "请输入整数年,如 2018"

    divisible_by_4 = years % 4 == 0
    is_century = years % 100 == 0
    divisible_by_400 = years % 400 == 0
    return 366 if (divisible_by_4 and not is_century) or divisible_by_400 else 365
 
 
def getAllDayPerYear(years):
    """Return every date of a year as ``YYYY-MM-DD`` strings.

    :param years: the year; it is converted with ``int()`` to count the days
        and formatted with ``%s`` to build the 1 January start date
    :return: list of date strings, one per day, starting at 1 January
    """
    # Day count first, then the start date, matching the original evaluation order.
    total_days = isLeapYear(int(years))
    first_day = arrow.get('%s-1-1' % years)
    return [
        first_day.shift(days=offset).format("YYYY-MM-DD")
        for offset in range(total_days)
    ]

# 用列表推导去除列表空白列
def remove_space(list_of_data, index_of_space):
    """Return a copy of ``list_of_data`` without the entries at the given positions.

    :param list_of_data: sequence to filter; it is not modified
    :param index_of_space: iterable of integer positions to drop
    :return: new list holding the remaining entries in their original order
    """
    # Build the lookup once: a set gives O(1) membership tests instead of
    # scanning the index list again for every element.
    dropped = set(index_of_space)
    return [data for index, data in enumerate(list_of_data) if index not in dropped]

# 从原始逐日降水量表转换成archigh能接受的形式
def transform(file_name):
    """Convert one daily-precipitation sheet into a NewStyle workbook.

    From the first sheet of ``file_name`` this reads the year (cell (1, 2)),
    the station code (cell (1, 5)) and, for month ``m``, the daily values in
    column ``1 + m`` starting at row 4. The marker characters "·" and "*" are
    stripped from text values, blank days are dropped, and the remaining
    records are written as station code / start date / end date /
    precipitation rows. The end date is the day after the start date.

    The result is saved next to the source as
    ``<file_name without extension>NewStyle.xls``.

    :param file_name: path of the source workbook
    :return: None
    """
    data = xlrd.open_workbook(file_name)
    table = data.sheets()[0]

    year = int(table.cell(1, 2).value)
    station_code = table.cell(1, 5).value

    # Concatenate the twelve month columns into one list covering the year.
    raw_values = []
    for month in range(1, 13):
        days = calendar.monthrange(year, month)[1]
        raw_values += table.col_values(1 + month)[4 : 4 + days]

    # Strip the "·" and "*" markers. xlrd returns numeric cells as floats,
    # which have no .replace(); those values are passed through unchanged
    # instead of raising AttributeError.
    list_of_precipitation = [
        value.replace("·", '').replace("*", '') if isinstance(value, str) else value
        for value in raw_values
    ]

    # Each record spans one day: it ends on the following calendar day.
    start_date = getAllDayPerYear(str(year))
    end_date = start_date[1:] + [str(year + 1) + "-01-01"]

    # Drop the days whose precipitation cell is blank from all three lists.
    index_of_space = [
        index for index, value in enumerate(list_of_precipitation) if value == ""
    ]
    start_date = remove_space(start_date, index_of_space)
    end_date = remove_space(end_date, index_of_space)
    list_of_precipitation = remove_space(list_of_precipitation, index_of_space)

    # Output workbook: header row followed by one row per remaining record.
    output = xlwt.Workbook()
    sheet1 = output.add_sheet("逐日降水量")
    for col, title in enumerate(("测站编码", "开始日期", "结束日期", "降水量(mm)")):
        sheet1.write(0, col, title)

    for row, precipitation in enumerate(list_of_precipitation, 1):
        sheet1.write(row, 0, station_code)
        sheet1.write(row, 3, precipitation)

    for row, day in enumerate(start_date, 1):
        sheet1.write(row, 1, day)

    for row, day in enumerate(end_date, 1):
        sheet1.write(row, 2, day)

    output.save(os.path.splitext(file_name)[0] + "NewStyle" + ".xls")


# 主函数
if __name__ == "__main__":
    # Script entry point: convert every daily-precipitation workbook found
    # below the current working directory.

    print("===本程序将逐日降水量表转换为ArcHIGH能接受的形式NewStyle===")

    # A path is a source workbook when it contains this marker.
    source_marker = "逐日降水量表.xls"

    for workbook_path in get_file_name(os.getcwd()):
        if source_marker in workbook_path:
            transform(workbook_path)

    print("===转换完毕,请查看...逐日降水量表NewStyle.xls文件===")

逐日平均水位

import xlrd
import xlwt
import calendar
import arrow
import os  

# 输出该目录下所有子文件目录  
def get_file_name(file_dir):
    """Collect the path of every file found below ``file_dir``.

    The tree is walked recursively with ``os.walk`` and each file name is
    joined with the directory it was found in.

    :param file_dir: root directory to scan
    :return: list of file paths, in the order ``os.walk`` yields them
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(file_dir)
        for name in names
    ]
        
def isLeapYear(years):
    """Return the number of days in the given year.

    Despite its name the function does not return a boolean: it yields 366
    for a Gregorian leap year and 365 otherwise.

    :param years: the year, int
    :return: 366 or 365
    :raises AssertionError: if ``years`` is not an int
    """
    # Reject non-integer years before doing any arithmetic on them.
    assert isinstance(years, int), "请输入整数年,如 2018"

    divisible_by_4 = years % 4 == 0
    is_century = years % 100 == 0
    divisible_by_400 = years % 400 == 0
    return 366 if (divisible_by_4 and not is_century) or divisible_by_400 else 365
 
 
def getAllDayPerYear(years):
    """Return every date of a year as ``YYYY-MM-DD`` strings.

    :param years: the year; it is converted with ``int()`` to count the days
        and formatted with ``%s`` to build the 1 January start date
    :return: list of date strings, one per day, starting at 1 January
    """
    # Day count first, then the start date, matching the original evaluation order.
    total_days = isLeapYear(int(years))
    first_day = arrow.get('%s-1-1' % years)
    return [
        first_day.shift(days=offset).format("YYYY-MM-DD")
        for offset in range(total_days)
    ]

# 从原始逐日平均水位表转换成archigh能接受的形式
def transform(file_name):
    """Convert one daily mean stage sheet into a NewStyle workbook.

    From the first sheet of ``file_name`` this reads the year (cell (2, 1)),
    the station code (cell (2, 4)), the offset between the tabulated stage
    and the 85 datum (cell (2, 9)) and, for month ``m``, the daily stages in
    column ``1 + m`` starting at row 5.

    Values written without an integer part are completed with the integer
    part of the most recent full value, "X" markers are removed, and the 85
    datum stage is computed as tabulated stage plus offset, rounded to three
    decimals. The result is saved next to the source as
    ``<file_name without extension>NewStyle.xls``.

    :param file_name: path of the source workbook
    :return: None
    """
    # Local import so the block works without touching the file's import list.
    import ast

    data = xlrd.open_workbook(file_name)
    table = data.sheets()[0]

    year = int(table.cell(2, 1).value)
    station_code = table.cell(2, 4).value

    # Offset between the tabulated stage and the 85 datum. The cell text used
    # to be passed to eval(), which executes arbitrary expressions found in
    # the spreadsheet; literal_eval accepts the same numeric literals only.
    stage_change = ast.literal_eval(table.cell(2, 9).value.strip())

    # Human-readable datum conversion note, assembled from four header cells.
    datum = (table.cell(2, 6).value + table.cell(2, 9).value
             + table.cell(2, 10).value + table.cell(2, 11).value)

    # Concatenate the twelve month columns into one list covering the year.
    stage_of_year = []
    for month in range(1, 13):
        days = calendar.monthrange(year, month)[1]
        stage_of_year += table.col_values(1 + month)[5 : 5 + days]

    # Complete values that lack an integer part, carrying the integer part
    # forward from the latest value that had one ('0' until one is seen).
    stage_int = '0'
    for index, stage in enumerate(stage_of_year):
        if '.' in stage:
            stage_int = stage.split('.')[0]
        else:
            stage_of_year[index] = stage_int + '.' + stage

    # Remove the "X" marker from the stage text.
    stage_of_year = [stage.replace("X", '') for stage in stage_of_year]

    # Stage relative to the 85 datum, kept as text like the tabulated stage.
    list_of_stage_85 = [
        str(round(ast.literal_eval(stage.strip()) + stage_change, 3))
        for stage in stage_of_year
    ]

    # Output workbook: header row, datum note, then one row per day.
    output = xlwt.Workbook()
    sheet1 = output.add_sheet("逐日平均水位")
    headers = ("测站编码", "日期", "85基准水位(m)", "表内水位(m)", "基准面转换关系")
    for col, title in enumerate(headers):
        sheet1.write(0, col, title)

    sheet1.write(1, 4, datum)

    for row, day in enumerate(getAllDayPerYear(str(year)), 1):
        sheet1.write(row, 1, day)

    for row, (stage_85, stage) in enumerate(zip(list_of_stage_85, stage_of_year), 1):
        sheet1.write(row, 0, station_code)
        sheet1.write(row, 2, stage_85)
        sheet1.write(row, 3, stage)

    output.save(os.path.splitext(file_name)[0] + "NewStyle" + ".xls")


# 主函数
if __name__ == "__main__":
    # Script entry point: convert every daily mean stage workbook found
    # below the current working directory.

    print("===本程序将逐日平均水位表转换为ArcHIGH能接受的形式NewStyle===")

    # A path is a source workbook when it contains this marker.
    source_marker = "逐日平均水位表.xls"

    for workbook_path in get_file_name(os.getcwd()):
        if source_marker in workbook_path:
            # Echo the file being processed before converting it.
            print(workbook_path)
            transform(workbook_path)

    print("===转换完毕,请查看...逐日平均水位表NewStyle.xls文件===")

逐日水面蒸发量

import xlrd
import xlwt
import calendar
import arrow
import os
import re

# 输出该目录下所有子文件目录  
def get_file_name(file_dir):
    """Collect the path of every file found below ``file_dir``.

    The tree is walked recursively with ``os.walk`` and each file name is
    joined with the directory it was found in.

    :param file_dir: root directory to scan
    :return: list of file paths, in the order ``os.walk`` yields them
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(file_dir)
        for name in names
    ]
        
def isLeapYear(years):
    """Return the number of days in the given year.

    Despite its name the function does not return a boolean: it yields 366
    for a Gregorian leap year and 365 otherwise.

    :param years: the year, int
    :return: 366 or 365
    :raises AssertionError: if ``years`` is not an int
    """
    # Reject non-integer years before doing any arithmetic on them.
    assert isinstance(years, int), "请输入整数年,如 2018"

    divisible_by_4 = years % 4 == 0
    is_century = years % 100 == 0
    divisible_by_400 = years % 400 == 0
    return 366 if (divisible_by_4 and not is_century) or divisible_by_400 else 365
 
 
def getAllDayPerYear(years):
    """Return every date of a year as ``YYYY-MM-DD`` strings.

    :param years: the year; it is converted with ``int()`` to count the days
        and formatted with ``%s`` to build the 1 January start date
    :return: list of date strings, one per day, starting at 1 January
    """
    # Day count first, then the start date, matching the original evaluation order.
    total_days = isLeapYear(int(years))
    first_day = arrow.get('%s-1-1' % years)
    return [
        first_day.shift(days=offset).format("YYYY-MM-DD")
        for offset in range(total_days)
    ]

# 从原始逐日水面蒸发量表转换成archigh能接受的形式
def transform(file_name):
    """Convert one daily water-surface evaporation sheet into a NewStyle workbook.

    The year and station code are parsed from the text of cell (1, 0), which
    is split on colons and spaces: the year is the second piece and the
    station code the fourth. For month ``m`` the daily values are taken from
    column ``1 + m`` starting at row 4. The result is saved next to the
    source as ``<file_name without extension>NewStyle.xls``.

    :param file_name: path of the source workbook
    :return: None
    """
    workbook = xlrd.open_workbook(file_name)
    source_sheet = workbook.sheets()[0]

    # Header text lives in the left-most cell of a merged range.
    header_parts = re.split('[: ]', source_sheet.cell(1, 0).value)
    year = int(header_parts[1])
    station_code = int(header_parts[3])

    # Concatenate the twelve month columns into one list covering the year.
    evaporation_of_year = []
    for month in range(1, 13):
        month_length = calendar.monthrange(year, month)[1]
        evaporation_of_year.extend(
            source_sheet.col_values(1 + month)[4 : 4 + month_length]
        )

    # Output workbook: header row followed by one row per day.
    output = xlwt.Workbook()
    sheet1 = output.add_sheet("逐日水面蒸发量")
    for col, title in enumerate(("测站编码", "日期", "水面蒸发量(mm)")):
        sheet1.write(0, col, title)

    # Station code and evaporation share the same row range.
    for row, evaporation in enumerate(evaporation_of_year, 1):
        sheet1.write(row, 0, station_code)
        sheet1.write(row, 2, evaporation)

    # Dates are written independently, one per calendar day of the year.
    for row, day in enumerate(getAllDayPerYear(str(year)), 1):
        sheet1.write(row, 1, day)

    target_path = os.path.splitext(file_name)[0] + "NewStyle" + ".xls"
    output.save(target_path)


# 主函数
if __name__ == "__main__":
    # Script entry point: convert every daily evaporation workbook found
    # below the current working directory.

    print("===本程序将逐日水面蒸发量表转换为ArcHIGH能接受的形式NewStyle===")

    # A path is a source workbook when it contains this marker.
    source_marker = "逐日水面蒸发量表.xls"

    for workbook_path in get_file_name(os.getcwd()):
        if source_marker in workbook_path:
            transform(workbook_path)

    print("===转换完毕,请查看...逐日水面蒸发量表NewStyle.xls文件===")

降水量摘录

import xlrd
import xlwt
import os
import datetime

# 输出该目录下所有子文件目录  
def get_file_name(file_dir):
    """Collect the path of every file found below ``file_dir``.

    The tree is walked recursively with ``os.walk`` and each file name is
    joined with the directory it was found in.

    :param file_dir: root directory to scan
    :return: list of file paths, in the order ``os.walk`` yields them
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(file_dir)
        for name in names
    ]

# 用列表推导去除列表空白列
def remove_space(list_of_data, index_of_space):
    """Return a copy of ``list_of_data`` without the entries at the given positions.

    :param list_of_data: sequence to filter; it is not modified
    :param index_of_space: iterable of integer positions to drop
    :return: new list holding the remaining entries in their original order
    """
    # Build the lookup once: a set gives O(1) membership tests instead of
    # scanning the index list again for every element.
    dropped = set(index_of_space)
    return [data for index, data in enumerate(list_of_data) if index not in dropped]

# 降水量摘录表中结束时间有24时,这个不合格,弄成下一天0时
def change_time_type(year, month, day):
    """Return midnight of the day after the given date as a timestamp string.

    Used for excerpt rows whose end time is hour 24, which is expressed as
    00:00:00 of the following day.

    :param year: calendar year, int
    :param month: calendar month, int
    :param day: day of the month, int
    :return: timestamp formatted as ``YYYY-MM-DD HH:MM:SS``
    """
    midnight = datetime.datetime(year, month, day)
    following_midnight = midnight + datetime.timedelta(days=1)
    return following_midnight.strftime("%Y-%m-%d %H:%M:%S")
    
# 从降水量摘录表转换成archigh能接受的形式
def transform(file_name):
    """Convert one precipitation-excerpt sheet into a NewStyle workbook.

    From the first sheet of ``file_name`` this reads the year (cell (1, 2)),
    the station code (cell (1, 5)) and the page count (second
    whitespace-separated token of cell (1, 17)). Every page holds four groups
    of five columns (month, day, start hour, end hour, precipitation) with 50
    data rows; pages start 54 rows apart, the first at row 4.

    Rows with a blank precipitation cell are dropped. Month and day are
    carried forward from the last row that stated them. An end hour of 24 is
    written as 00:00:00 of the following day. The result is saved next to the
    source as ``<file_name without extension>NewStyle.xls``.

    NOTE(review): an interval whose end hour is not later than its start hour
    is still stamped with the same date - confirm whether the yearbook ever
    lists periods that cross midnight.

    :param file_name: path of the source workbook
    :return: None
    """
    data = xlrd.open_workbook(file_name)
    table = data.sheets()[0]

    year = int(table.cell(1, 2).value)
    station_code = table.cell(1, 5).value
    pages = int(table.cell(1, 17).value.split()[1])

    list_of_month = []
    list_of_day = []
    list_of_start_time = []
    list_of_end_time = []
    list_of_precipitation = []

    # Read page by page, and within a page column group by column group.
    for page in range(pages):
        first_row = 4 + 54 * page
        rows = slice(first_row, first_row + 50)
        for col in range(4):
            base = 5 * col
            list_of_month += table.col_values(base)[rows]
            list_of_day += table.col_values(base + 1)[rows]
            list_of_start_time += table.col_values(base + 2)[rows]
            list_of_end_time += table.col_values(base + 3)[rows]
            list_of_precipitation += table.col_values(base + 4)[rows]

    # Drop the rows whose precipitation cell is blank from all five lists.
    index_of_space = [
        index for index, value in enumerate(list_of_precipitation) if value == ""
    ]
    list_of_month = remove_space(list_of_month, index_of_space)
    list_of_day = remove_space(list_of_day, index_of_space)
    list_of_start_time = remove_space(list_of_start_time, index_of_space)
    list_of_end_time = remove_space(list_of_end_time, index_of_space)
    list_of_precipitation = remove_space(list_of_precipitation, index_of_space)

    list_of_start_date_time = []
    list_of_end_date_time = []

    # Month and day are only stated on the first row they apply to.
    current_month = "0"
    current_day = "0"
    for month, day, start_time, end_time in zip(
            list_of_month, list_of_day, list_of_start_time, list_of_end_time):
        # The month can only change on a row that also states a day.
        if day != "":
            current_day = day
            if month != "":
                current_month = month

        month_number = int(current_month)
        day_number = int(current_day)

        list_of_start_date_time.append(
            "{}-{}-{} {}:00:00".format(year, month_number, day_number, int(start_time))
        )

        # Normalise the end hour before testing for 24, so that a text cell
        # "24" is handled like a numeric 24.0 instead of producing an invalid
        # "24:00:00" timestamp.
        end_hour = int(end_time)
        if end_hour == 24:
            end_date_time = change_time_type(year, month_number, day_number)
        else:
            end_date_time = "{}-{}-{} {}:00:00".format(
                year, month_number, day_number, end_hour
            )
        list_of_end_date_time.append(end_date_time)

    # Output workbook: header row followed by one row per excerpt record.
    output = xlwt.Workbook()
    sheet1 = output.add_sheet("降水量摘录")
    for col, title in enumerate(("测站编码", "开始时间", "结束时间", "降水量(mm)")):
        sheet1.write(0, col, title)

    records = zip(list_of_start_date_time, list_of_end_date_time, list_of_precipitation)
    for row, (start_date_time, end_date_time, precipitation) in enumerate(records, 1):
        sheet1.write(row, 0, station_code)
        sheet1.write(row, 1, start_date_time)
        sheet1.write(row, 2, end_date_time)
        sheet1.write(row, 3, precipitation)

    output.save(os.path.splitext(file_name)[0] + "NewStyle" + ".xls")


# 主函数
if __name__ == "__main__":
    # Script entry point: convert every precipitation-excerpt workbook found
    # below the current working directory.

    print("===本程序将降水量摘录表转换为ArcHIGH能接受的形式NewStyle===")

    # A path is a source workbook when it contains this marker.
    source_marker = "降水量摘录表.xls"

    for workbook_path in get_file_name(os.getcwd()):
        if source_marker in workbook_path:
            transform(workbook_path)

    print("===转换完毕,请查看...降水量摘录表NewStyle.xls文件===")

洪水水位摘录

import xlrd
import xlwt
import os
import re

# 输出该目录下所有子文件目录  
def get_file_name(file_dir):
    """Collect the path of every file found below ``file_dir``.

    The tree is walked recursively with ``os.walk`` and each file name is
    joined with the directory it was found in.

    :param file_dir: root directory to scan
    :return: list of file paths, in the order ``os.walk`` yields them
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(file_dir)
        for name in names
    ]

# 用列表推导去除列表空白列
def remove_space(list_of_data, index_of_space):
    """Return a copy of ``list_of_data`` without the entries at the given positions.

    :param list_of_data: sequence to filter; it is not modified
    :param index_of_space: iterable of integer positions to drop
    :return: new list holding the remaining entries in their original order
    """
    # Build the lookup once: a set gives O(1) membership tests instead of
    # scanning the index list again for every element.
    dropped = set(index_of_space)
    return [data for index, data in enumerate(list_of_data) if index not in dropped]
    
# 从洪水水位摘录表转换成archigh能接受的形式
def transform(file_name):
    """Convert one flood stage excerpt sheet into a NewStyle workbook.

    From the first sheet of ``file_name`` this reads the year (cell (1, 2)),
    the station code (cell (1, 5)) and the page count (first run of digits in
    the first whitespace-separated token of cell (1, 17)). Every page holds
    five groups of four columns (month, day, time, stage) with 50 data rows;
    pages start 54 rows apart, the first at row 4.

    The datum offset and conversion note are read from the matching daily
    mean stage workbook, whose path is obtained by replacing "洪水水位摘录表"
    with "逐日平均水位表" in ``file_name``.

    Rows with a blank stage are dropped, month and day are carried forward
    from the last row that stated them, stages written without an integer
    part are completed from the latest full value, and the 85 datum stage is
    the tabulated stage plus the offset, rounded to three decimals. The
    result is saved next to the source as
    ``<file_name without extension>NewStyle.xls``.

    :param file_name: path of the source workbook
    :return: None
    """
    # Local import so the block works without touching the file's import list.
    import ast

    data = xlrd.open_workbook(file_name)
    table = data.sheets()[0]

    year = int(table.cell(1, 2).value)
    station_code = table.cell(1, 5).value

    # The datum information is only present in the daily mean stage workbook.
    data1 = xlrd.open_workbook(file_name.replace("洪水水位摘录表", "逐日平均水位表"))
    table1 = data1.sheets()[0]

    # Offset between the tabulated stage and the 85 datum. The cell text used
    # to be passed to eval(), which executes arbitrary expressions found in
    # the spreadsheet; literal_eval accepts the same numeric literals only.
    stage_change = ast.literal_eval(table1.cell(2, 9).value.strip())

    # Human-readable datum conversion note, assembled from four header cells.
    datum = (table1.cell(2, 6).value + table1.cell(2, 9).value
             + table1.cell(2, 10).value + table1.cell(2, 11).value)

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    pages = int(re.findall(r"\d+", table.cell(1, 17).value.split()[0])[0])

    list_of_month = []
    list_of_day = []
    list_of_time = []
    list_of_stage = []

    # Read page by page, and within a page column group by column group.
    for page in range(pages):
        first_row = 4 + 54 * page
        rows = slice(first_row, first_row + 50)
        for col in range(5):
            base = 4 * col
            list_of_month += table.col_values(base)[rows]
            list_of_day += table.col_values(base + 1)[rows]
            list_of_time += table.col_values(base + 2)[rows]
            list_of_stage += table.col_values(base + 3)[rows]

    # Drop the rows whose stage cell is blank from all four lists.
    index_of_space = [
        index for index, value in enumerate(list_of_stage) if value == ""
    ]
    list_of_month = remove_space(list_of_month, index_of_space)
    list_of_day = remove_space(list_of_day, index_of_space)
    list_of_time = remove_space(list_of_time, index_of_space)
    list_of_stage = remove_space(list_of_stage, index_of_space)

    list_of_date_time = []

    # Month and day are only stated on the first row they apply to.
    current_month = "0"
    current_day = "0"
    for month, day, clock in zip(list_of_month, list_of_day, list_of_time):
        # The month can only change on a row that also states a day.
        if day != "":
            current_day = day
            if month != "":
                current_month = month

        # A numeric time cell is a whole hour; other values are used as given.
        if isinstance(clock, float):
            clock = str(int(clock)) + ":00"

        list_of_date_time.append(
            "{}-{}-{} {}:00".format(year, int(current_month), int(current_day), clock)
        )

    # Complete values that lack an integer part, carrying the integer part
    # forward from the latest value that had one ('0' until one is seen).
    stage_int = '0'
    for index, stage in enumerate(list_of_stage):
        if '.' in stage:
            stage_int = stage.split('.')[0]
        else:
            list_of_stage[index] = stage_int + '.' + stage

    # Stage relative to the 85 datum, kept as text like the tabulated stage.
    list_of_stage_85 = [
        str(round(ast.literal_eval(stage.strip()) + stage_change, 3))
        for stage in list_of_stage
    ]

    # Output workbook: header row, datum note, then one row per record.
    output = xlwt.Workbook()
    sheet1 = output.add_sheet("洪水水位摘录")
    headers = ("测站编码", "日期时间", "85基准水位(m)", "表内水位(m)", "基准面转换关系")
    for col, title in enumerate(headers):
        sheet1.write(0, col, title)

    sheet1.write(1, 4, datum)

    records = zip(list_of_date_time, list_of_stage_85, list_of_stage)
    for row, (date_time, stage_85, stage) in enumerate(records, 1):
        sheet1.write(row, 0, station_code)
        sheet1.write(row, 1, date_time)
        sheet1.write(row, 2, stage_85)
        sheet1.write(row, 3, stage)

    output.save(os.path.splitext(file_name)[0] + "NewStyle" + ".xls")


# 主函数
if __name__ == "__main__":
    # Script entry point: convert every flood stage excerpt workbook found
    # below the current working directory.

    print("===本程序将洪水水位摘录表转换为ArcHIGH能接受的形式NewStyle===")

    # A path is a source workbook when it contains this marker.
    source_marker = "洪水水位摘录表.xls"

    for workbook_path in get_file_name(os.getcwd()):
        if source_marker in workbook_path:
            transform(workbook_path)

    print("===转换完毕,请查看...洪水水位摘录表NewStyle.xls文件===")

洪水水文要素摘录

import xlrd
import xlwt
import os
import re

# 输出该目录下所有子文件目录  
def get_file_name(file_dir):
    """Collect the path of every file found below ``file_dir``.

    The tree is walked recursively with ``os.walk`` and each file name is
    joined with the directory it was found in.

    :param file_dir: root directory to scan
    :return: list of file paths, in the order ``os.walk`` yields them
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(file_dir)
        for name in names
    ]

# 用列表推导去除列表空白列
def remove_space(list_of_data, index_of_space):
    """Return a copy of ``list_of_data`` without the entries at the given positions.

    :param list_of_data: sequence to filter; it is not modified
    :param index_of_space: iterable of integer positions to drop
    :return: new list holding the remaining entries in their original order
    """
    # Build the lookup once: a set gives O(1) membership tests instead of
    # scanning the index list again for every element.
    dropped = set(index_of_space)
    return [data for index, data in enumerate(list_of_data) if index not in dropped]
    
# 从洪水水文要素摘录表转换成archigh能接受的形式
def transform(file_name):
    """Convert one flood hydrologic-element excerpt sheet into a NewStyle workbook.

    From the first sheet of ``file_name`` this reads the year (cell (2, 2)),
    the station code (cell (2, 6)) and the page count (first run of digits in
    the first whitespace-separated token of cell (2, 18)). Every page holds
    four groups of five columns (month, day, time, stage, discharge) with 50
    data rows; pages start 55 rows apart, the first at row 5.

    The datum offset and conversion note are read from the matching daily
    mean stage workbook, whose path is obtained by replacing
    "洪水水文要素摘录表(二要素)" with "逐日平均水位表" in ``file_name``.

    Rows with a blank stage are dropped, month and day are carried forward
    from the last row that stated them, stages written without an integer
    part are completed from the latest full value, and the 85 datum stage is
    the tabulated stage plus the offset, rounded to three decimals. The
    result is saved next to the source as
    ``<file_name without extension>NewStyle.xls``.

    :param file_name: path of the source workbook
    :return: None
    """
    # Local import so the block works without touching the file's import list.
    import ast

    data = xlrd.open_workbook(file_name)
    table = data.sheets()[0]

    year = int(table.cell(2, 2).value)
    station_code = table.cell(2, 6).value

    # The datum information is only present in the daily mean stage workbook.
    data1 = xlrd.open_workbook(
        file_name.replace("洪水水文要素摘录表(二要素)", "逐日平均水位表")
    )
    table1 = data1.sheets()[0]

    # Offset between the tabulated stage and the 85 datum. The cell text used
    # to be passed to eval(), which executes arbitrary expressions found in
    # the spreadsheet; literal_eval accepts the same numeric literals only.
    stage_change = ast.literal_eval(table1.cell(2, 9).value.strip())

    # Human-readable datum conversion note, assembled from four header cells.
    datum = (table1.cell(2, 6).value + table1.cell(2, 9).value
             + table1.cell(2, 10).value + table1.cell(2, 11).value)

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    pages = int(re.findall(r"\d+", table.cell(2, 18).value.split()[0])[0])

    list_of_month = []
    list_of_day = []
    list_of_time = []
    list_of_stage = []
    list_of_discharge = []

    # Read page by page, and within a page column group by column group.
    for page in range(pages):
        first_row = 5 + 55 * page
        rows = slice(first_row, first_row + 50)
        for col in range(4):
            base = 5 * col
            list_of_month += table.col_values(base)[rows]
            list_of_day += table.col_values(base + 1)[rows]
            list_of_time += table.col_values(base + 2)[rows]
            list_of_stage += table.col_values(base + 3)[rows]
            list_of_discharge += table.col_values(base + 4)[rows]

    # Drop the rows whose stage cell is blank from all five lists.
    index_of_space = [
        index for index, value in enumerate(list_of_stage) if value == ""
    ]
    list_of_month = remove_space(list_of_month, index_of_space)
    list_of_day = remove_space(list_of_day, index_of_space)
    list_of_time = remove_space(list_of_time, index_of_space)
    list_of_stage = remove_space(list_of_stage, index_of_space)
    list_of_discharge = remove_space(list_of_discharge, index_of_space)

    list_of_date_time = []

    # Month and day are only stated on the first row they apply to.
    current_month = "0"
    current_day = "0"
    for month, day, clock in zip(list_of_month, list_of_day, list_of_time):
        # The month can only change on a row that also states a day.
        if day != "":
            current_day = day
            if month != "":
                current_month = month

        # A numeric time cell is a whole hour; other values are used as given.
        if isinstance(clock, float):
            clock = str(int(clock)) + ":00"

        list_of_date_time.append(
            "{}-{}-{} {}:00".format(year, int(current_month), int(current_day), clock)
        )

    # Complete values that lack an integer part, carrying the integer part
    # forward from the latest value that had one ('0' until one is seen).
    stage_int = '0'
    for index, stage in enumerate(list_of_stage):
        if '.' in stage:
            stage_int = stage.split('.')[0]
        else:
            list_of_stage[index] = stage_int + '.' + stage

    # Stage relative to the 85 datum, kept as text like the tabulated stage.
    list_of_stage_85 = [
        str(round(ast.literal_eval(stage.strip()) + stage_change, 3))
        for stage in list_of_stage
    ]

    # Output workbook: header row, datum note, then one row per record.
    output = xlwt.Workbook()
    sheet1 = output.add_sheet("洪水水文要素摘录")
    headers = ("测站编码", "日期时间", "85基准水位(m)", "流量(m3/s)",
               "表内水位(m)", "基准面转换关系")
    for col, title in enumerate(headers):
        sheet1.write(0, col, title)

    sheet1.write(1, 5, datum)

    records = zip(list_of_date_time, list_of_stage_85, list_of_discharge, list_of_stage)
    for row, (date_time, stage_85, discharge, stage) in enumerate(records, 1):
        sheet1.write(row, 0, station_code)
        sheet1.write(row, 1, date_time)
        sheet1.write(row, 2, stage_85)
        sheet1.write(row, 3, discharge)
        sheet1.write(row, 4, stage)

    output.save(os.path.splitext(file_name)[0] + "NewStyle" + ".xls")


# 主函数
if __name__ == "__main__":
    # Script entry point: convert every flood hydrologic-element excerpt
    # workbook found below the current working directory.

    print("===本程序将洪水水文要素摘录表(二要素)转换为ArcHIGH能接受的形式NewStyle===")

    # A path is a source workbook when it contains this marker.
    source_marker = "洪水水文要素摘录表(二要素).xls"

    for file_name in get_file_name(os.getcwd()):
        if source_marker in file_name:
            transform(file_name)

    # The closing message used to name the output file with a duplicated
    # first character; it now matches the file name actually written.
    print("===转换完毕,请查看...洪水水文要素摘录表(二要素)NewStyle.xls文件===")

水文数据汇总

import xlrd
import xlwt
import os  

#====将所有的NewStyle提取到一张汇总表上====

# 输出该目录下所有子文件目录  
def get_file_name(file_dir):
    """Collect the path of every file found below ``file_dir``.

    The tree is walked recursively with ``os.walk`` and each file name is
    joined with the directory it was found in.

    :param file_dir: root directory to scan
    :return: list of file paths, in the order ``os.walk`` yields them
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(file_dir)
        for name in names
    ]
        
def extract_to_summary(file_name, sheet, cols, sheet_num):
    """Append the data rows of one NewStyle workbook to a summary sheet.

    The first ``cols`` columns of the workbook's first sheet are copied,
    skipping its header row, below the rows already written to ``sheet``.
    The running row count is kept in the module-level list ``rows_sum``,
    which this function reads and updates at position ``sheet_num``.

    :param file_name: path of the NewStyle workbook to read
    :param sheet: xlwt worksheet receiving the rows
    :param cols: number of leading columns to copy
    :param sheet_num: index of ``sheet`` in ``rows_sum``
    :return: None
    """
    source = xlrd.open_workbook(file_name).sheets()[0]

    # Number of records in this workbook (all rows except the header).
    record_count = source.nrows - 1

    # Row 0 of the summary sheet is its header, hence the extra 1.
    first_free_row = 1 + rows_sum[sheet_num]

    for col_index in range(cols):
        values = source.col_values(col_index)[1:]
        for offset, value in enumerate(values):
            sheet.write(first_free_row + offset, col_index, value)

    rows_sum[sheet_num] += record_count
    
# 主函数
if __name__ == "__main__":
    # Script entry point: merge every NewStyle workbook found below the
    # current working directory into one summary workbook.

    print("===本程序将所有的NewStyle提取到一张水文数据汇总表上===")

    list_of_files = get_file_name(os.getcwd())

    # Rows already written to each summary sheet. extract_to_summary reads
    # and updates this module-level list, so the name must stay as it is.
    rows_sum = [0] * 6

    output = xlwt.Workbook()

    # One entry per summary sheet: sheet title, header row, and the source
    # table name that prefixes "NewStyle" in matching file names.
    summary_layout = [
        ("逐日降水量", ["STCD", "TMBEG", "TMEND", "R"], "逐日降水量表"),
        ("逐日平均水位", ["STCD", "TM", "Z"], "逐日平均水位表"),
        ("逐日水面蒸发量", ["STCD", "TM", "E"], "逐日水面蒸发量表"),
        ("降水量摘录", ["STCD", "TMBEG", "TMEND", "R"], "降水量摘录表"),
        ("洪水水位摘录", ["STCD", "TM", "Z"], "洪水水位摘录表"),
        ("洪水水文要素摘录", ["STCD", "TM", "Z", "Q"], "洪水水文要素摘录表(二要素)"),
    ]

    # Create the sheets in order and write their header rows; the number of
    # columns copied per sheet equals the number of headers.
    targets = []
    for title, headers, table_name in summary_layout:
        summary_sheet = output.add_sheet(title)
        for col, header in enumerate(headers):
            summary_sheet.write(0, col, header)
        targets.append((summary_sheet, len(headers), table_name + "NewStyle"))

    for file_name in list_of_files:
        for sheet_num, (summary_sheet, cols, marker) in enumerate(targets):
            if marker in file_name:
                extract_to_summary(file_name, summary_sheet, cols, sheet_num)

    output.save("水文数据汇总表.xls")

    print("===转换完毕,请查看水文数据汇总表.xls文件===")

所有文件目录

import xlrd
import xlwt
import os  

# 输出该目录下所有子文件目录  
def get_file_name(file_dir):
    """Collect the name of every file found below ``file_dir``.

    The tree is walked recursively with ``os.walk``. Only the bare file names
    are returned, without the directory they were found in.

    :param file_dir: root directory to scan
    :return: list of file names, in the order ``os.walk`` yields them
    """
    return [
        name
        for _folder, _subdirs, names in os.walk(file_dir)
        for name in names
    ]
        

# 主函数
if __name__ == "__main__":
    # Script entry point: list every file below the current working
    # directory that is not a generated NewStyle workbook, on screen and in
    # a one-column workbook.

    print("===打印目录中所有子文件===")

    list_of_origin_files = [
        name for name in get_file_name(os.getcwd()) if "NewStyle" not in name
    ]

    print('\n'.join(list_of_origin_files))

    output = xlwt.Workbook()
    sheet1 = output.add_sheet("目录名")
    sheet1.write(0, 0, "目录名")

    # Row 0 holds the header, so the names start at row 1.
    for offset, name in enumerate(list_of_origin_files):
        sheet1.write(offset + 1, 0, name)

    output.save("目录名.xls")

数据文件及程序

本地目录:笔记本电脑D:\Projects\数据清洗\杭州南排\第一次任务\提供杭州林水局,办公室电脑E:\Projects\数据清洗\杭州南排

你可能感兴趣的:(#,Python,python,数据清洗,水文年鉴,通用)