[2022.07.23]
环境 | 说明 |
---|---|
数据库 | MySQL 8.0.28 |
OS_1 | Ubuntu (内核5.16.17) |
OS_2 | CentOS-7-x86_64-1810 |
连接工具 | XShell7, Xftp |
sudo apt-get install mysql-server
安装后直接使用-uroot -p是无法登录MySQL的
[root@dataserver ~]# mysql -uroot -p
Enter password:
ERROR 1045 (28000): Access denied for user 'root'@'localhost' (using password: YES)
因为此时 root 使用的是随机的默认密码。可以先在 MySQL 配置文件的 [mysqld] 段中加入 skip-grant-tables 以跳过权限验证,重启服务后免密登录,再重设密码:
[mysqld]
skip-grant-tables
[root@dataserver ~]# service mysqld restart
Redirecting to /bin/systemctl restart mysqld.service
[root@dataserver ~]# mysql -uroot
Welcome to the MySQL monitor. Commands end with ; or \g.
Your MySQL connection id is 7
Server version: 8.0.28 MySQL Community Server - GPL
mysql> use mysql
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A
Database changed
mysql> set global validate_password.policy=LOW;
Query OK, 0 rows affected (0.00 sec)
mysql> flush privileges;
Query OK, 0 rows affected (0.00 sec)
mysql> set global validate_password.length = 6;
Query OK, 0 rows affected (0.00 sec)
mysql> flush privileges;
Query OK, 0 rows affected (0.00 sec)
mysql> alter user 'root'@'localhost' identified by 'your_password';
Query OK, 0 rows affected (0.00 sec)
systemctl restart mysql
使用新密码登录(重启服务前应先删除配置文件中的 skip-grant-tables,否则仍会跳过密码验证):
[root@dataserver ~]# mysql -uroot -p
Enter password:
Welcome to the MySQL monitor. Commands end with ; or \g.
Your MySQL connection id is 9
Server version: 8.0.28 MySQL Community Server - GPL
允许远程主机连接
mysql> use mysql;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A
Database changed
mysql> update user set host = ‘%' where user =’root';
ERROR 1054 (42S22): Unknown column '‘' in 'field list'
mysql> update user set host = '%' where user ='root';
Query OK, 1 row affected (0.00 sec)
Rows matched: 1 Changed: 1 Warnings: 0
mysql> flush privileges;
Query OK, 0 rows affected (0.01 sec)
修改配置文件 /etc/mysql/mysql.conf.d/mysqld.cnf
[mysqld]
bind-address = 0.0.0.0
port = 3306
测试是否能进行远程连接。
# 临时关闭防火墙
systemctl stop firewalld
# 永久关闭防火墙
systemctl disable firewalld
# 临时关闭 SElinux
setenforce 0
# 永久关闭 SElinux
# 修改/etc/selinux/config
SELINUX=disabled
# 解压缩
tar -xvf /opt/mysql-8.0.28-1.el7.aarch64.rpm-bundle.tar
# 无视依赖警告安装
rpm -Uvh *.rpm --nodeps --force
# 启动mysql服务
systemctl restart mysqld
后续配置基本与 Ubuntu 版本相同,不过要注意 CentOS 下的配置文件为 /etc/my.cnf,
其中的配置项(如 bind-address、port)需要自己添加。
环境配置及数据集处理参见前篇
基于pandas的简单数据挖掘程序构造(一)数据集的基本处理
直接使用dataframe类的to_sql()方法写入
import pymysql
import sqlalchemy
from sqlalchemy import create_engine
def sql_write(data_frame, table_name='result_table',
              db_url='mysql+pymysql://root:password@ip_address:3306/DKA_data?charset=utf8'):
    """Write a DataFrame into a MySQL table through SQLAlchemy.

    Args:
        data_frame: pandas DataFrame to persist; it is written without its
            index.
        table_name: Name of the target table. An existing table with the same
            name is replaced.
        db_url: SQLAlchemy connection URL. NOTE(review): the default looks
            like placeholder credentials/host from the article - replace it
            (or pass a real URL) before running.
    """
    print('====[Writing...]====')
    # Configure the database connection.
    engine = create_engine(db_url)
    try:
        # if_exists='replace': replace the table when one with this name exists.
        # dtype={...}: store the '时间戳' column as DATETIME.
        data_frame.to_sql(name=table_name, con=engine, index=False, if_exists='replace',
                          dtype={'时间戳': sqlalchemy.DATETIME})
    finally:
        # Release the engine's pooled connections even when the write fails.
        engine.dispose()
整数形式的时间戳在写入数据库时有问题,因此将该列的类型改为 datetime,修改后的完整源码如下:
# 22.7.16
import pandas as pd
import time
import datetime
from multiprocessing import Pool, cpu_count
import pymysql
import sqlalchemy
from sqlalchemy import create_engine
# Read the CSV file and format it as a DataFrame.
def dka_format(csv_file):
    """Load a measurement CSV and parse its '时间戳' column as datetimes.

    Args:
        csv_file: CSV source handed straight to ``pd.read_csv``.

    Returns:
        A DataFrame in which the measurement columns listed below are read
        as float32 and '时间戳' is parsed with the '%Y-%m-%d %H:%M:%S' format.
    """
    float32_columns = (
        '有功功率传输 (kWh)', '相电流 (A)',
        '有功功率 (kW)',
        '功率因数 (%)', '线电压 (V)', '频率 (Hz)',
        '总谐波失真电流 (%)', '总谐波失真电压 (%)', '风速 (m/s)',
        '温度 (°C)', '相对湿度 (%)', '总辐射 (W/m²)',
        '日射量 (W/m²)', '风向 (Degrees)', '每日降雨 (mm)',
    )
    column_types = dict.fromkeys(float32_columns, 'float32')
    # For quick experiments the read can be narrowed with skiprows=/nrows=.
    df_read = pd.read_csv(csv_file, dtype=column_types).copy()
    # The column is kept as datetime values; an earlier variant converted it
    # to integer epoch seconds instead.
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    df_read['时间戳'] = pd.to_datetime(df_read['时间戳'], format=timestamp_format)
    return df_read
# Check whether whole rows are missing.
def dka_check_row(csv_frame):
    """Find places where consecutive '时间戳' values are more than 5 minutes apart.

    The gaps are computed with a vectorised ``Series.diff`` on a positionally
    re-indexed copy of the column, so the frame's own index labels do not
    matter and frames with fewer than two rows are handled.

    Args:
        csv_frame: DataFrame with a datetime '时间戳' column.

    Returns:
        A list with one dict per detected gap, in row order:
        ``'row'`` is the position of the first row after the gap,
        ``'timestamp'`` is the timestamp of the row before the gap and
        ``'difference'`` is the time difference between those two rows.
        The list is empty when no gap exceeds 5 minutes.
    """
    print('====[Checking Rows...]====')
    stamps = csv_frame['时间戳'].reset_index(drop=True)
    if len(stamps) < 2:
        return []
    gaps = stamps.diff()
    # More than 300 s (5 min) between neighbours means rows are missing.
    # The first entry of `gaps` is NaT and compares as False.
    too_wide = gaps > datetime.timedelta(minutes=5)
    return [
        {'row': int(pos), 'timestamp': stamps[pos - 1], 'difference': gaps[pos]}
        for pos in too_wide[too_wide].index
    ]
# Rebuild the missing rows (timestamp only, every other column left empty).
def dka_rebuild_rows(csv_frame, missed_part):
    """Insert timestamp-only rows into the gaps described by ``missed_part``.

    The result is assembled from slices of the *original* frame, so the
    ``'row'`` offsets of later gaps stay valid after earlier gaps have been
    filled. The pieces are joined with ``pd.concat``.

    Args:
        csv_frame: DataFrame with a datetime '时间戳' column; it is not modified.
        missed_part: List of dicts with the keys ``'row'`` (position of the
            first row after the gap), ``'timestamp'`` (timestamp of the row
            before the gap) and ``'difference'`` (time span of the gap).

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Inserted rows carry only
        '时间戳' (5-minute steps after the gap's start timestamp); their other
        columns are missing values. With an empty ``missed_part`` a copy of
        ``csv_frame`` is returned.
    """
    if not missed_part:
        return csv_frame.copy()
    print('====[Rebuilding Rows...]====')
    step = datetime.timedelta(minutes=5)
    pieces = []
    cursor = 0  # position in csv_frame up to which rows were already emitted
    for gap in sorted(missed_part, key=lambda m: m['row']):
        row_begin = gap['row']
        time_begin = gap['timestamp']
        # Number of 5-minute steps spanned by the gap; every step except the
        # last one (the row that already exists) needs a new timestamp.
        stamp_num = gap['difference'] / step
        stamp_list = []
        while stamp_num > 1:
            time_begin = time_begin + step
            stamp_list.append(time_begin)
            stamp_num = stamp_num - 1
        # Rows before the gap (the row at row_begin itself is not included).
        if row_begin > cursor:
            pieces.append(csv_frame.iloc[cursor:row_begin])
        # Timestamp-only rows; the dtype is left to pandas so the column
        # stays a datetime column.
        if stamp_list:
            pieces.append(pd.DataFrame({'时间戳': stamp_list}))
        cursor = max(cursor, row_begin)
    # Remaining rows, starting with the row right after the last gap.
    pieces.append(csv_frame.iloc[cursor:])
    # Return the rebuilt DataFrame.
    return pd.concat(pieces, ignore_index=True)
# Fill missing values by interpolation.
# Receives a single pandas Series (one column).
def dka_process(frame_series):
    """Interpolate the missing values of one column.

    Args:
        frame_series: A pandas Series; its ``name`` decides whether it is
            processed.

    Returns:
        The very same Series when it is named '时间戳' or '每日降雨 (mm)';
        otherwise the result of ``interpolate(method='spline', order=3)``.
    """
    # Excluded columns (timestamp, daily rainfall) are handed back untouched.
    if frame_series.name in ('时间戳', '每日降雨 (mm)'):
        return frame_series
    print('====[', frame_series.name, ' Processing...]====')
    # Spline interpolation of order 3 (cubic spline) over the missing entries.
    filled = frame_series.interpolate(method='spline', order=3)
    print('====[', filled.name, ' Done.]====')
    # Return the interpolated Series.
    return filled
# Interpolate missing values with a process pool (CPU-bound work).
def dka_interpolate(dka_frame):
    """Run ``dka_process`` on every column in parallel and reassemble the frame.

    Args:
        dka_frame: DataFrame whose columns are split into Series and handed
            to ``dka_process``.

    Returns:
        A DataFrame built by concatenating the processed Series column-wise,
        in the original column order. A frame without columns is returned as
        a copy.
    """
    print('====[Interpolating...]====')
    # Split the frame into one Series per column.
    print('====[Cutting...]====')
    series_pkg = [dka_frame[c] for c in dka_frame.columns]
    if not series_pkg:
        return dka_frame.copy()
    # The first and last columns are not counted as work (presumably the
    # timestamp and rainfall columns that dka_process passes through - verify
    # against the data set). The pool needs at least one worker, and more
    # workers than CPU cores do not help for CPU-bound interpolation.
    workload = max(len(series_pkg) - 2, 0)
    workers = max(1, min(workload, cpu_count()))
    # The context manager tears the pool down even when a worker raises;
    # map() returns the results in the order of its input.
    with Pool(workers) as pool:
        series_list = pool.map(dka_process, series_pkg)
    print('====[Connecting...]====')
    # Join the processed Series column-wise, in order, into a DataFrame.
    df_interpolated = pd.concat(series_list, axis=1, ignore_index=False)
    # info() prints its report itself and returns None, so it is not wrapped
    # in print().
    df_interpolated.info()
    return df_interpolated
def sql_write(data_frame, table_name='result_table',
              db_url='mysql+pymysql://root:password@ip_address:3306/DKA_data?charset=utf8'):
    """Write a DataFrame into a MySQL table through SQLAlchemy.

    Args:
        data_frame: pandas DataFrame to persist; it is written without its
            index.
        table_name: Name of the target table. An existing table with the same
            name is replaced.
        db_url: SQLAlchemy connection URL. NOTE(review): the default looks
            like placeholder credentials/host - replace it (or pass a real
            URL) before running.
    """
    print('====[Writing...]====')
    engine = create_engine(db_url)
    try:
        # The '时间戳' column is stored as DATETIME.
        data_frame.to_sql(name=table_name, con=engine, index=False, if_exists='replace',
                          dtype={'时间戳': sqlalchemy.DATETIME})
    finally:
        # Release the engine's pooled connections even when the write fails.
        engine.dispose()
if __name__ == '__main__':
    # Pipeline: load CSV -> detect gaps -> rebuild missing rows ->
    # interpolate missing values -> write the result to the database.
    start = time.perf_counter()
    file = '96-Site_DKA-MasterMeter1_piece_2017.csv'
    dka_frame = dka_format(file)
    check_result = dka_check_row(dka_frame)
    if check_result:
        for missed in check_result:
            print(f"[Missed] | {missed['row']}: {missed['timestamp']} with {missed['difference']}")
        df_row_rebuilt = dka_rebuild_rows(dka_frame, check_result)
        # info() prints its report itself and returns None.
        df_row_rebuilt.info()
    else:
        # Nothing to rebuild: continue with the frame as loaded, so the
        # interpolation step below always has a frame to work on.
        print('[Info] | No missed row')
        df_row_rebuilt = dka_frame
    rebuilt_done_frame = dka_interpolate(df_row_rebuilt)
    rebuilt_done_frame.info()
    # The timer stops before the database write, so the reported run time
    # covers only loading and processing.
    end = time.perf_counter()
    sql_write(rebuilt_done_frame)
    print("运行时间为", round(end - start), 'seconds')