to_sql是Pandas中用于将DataFrame数据写入数据库的方法,可以将DataFrame转换为SQL语句,方便我们将数据存入数据库中,以便进行后续的操作。
to_sql方法中包含多个参数,比较常用的参数有name(表名)、con(数据库连接对象)、if_exists(若表已经存在,进行何种操作)、index(是否将DataFrame的索引作为一列写入数据库,默认为True)等。
pandas.read_sql(sql, con, index_col=None, coerce_float=True, params=None, parse_dates=None, columns=None, chunksize=None, dtype_backend=_NoDefault.no_default, dtype=None)
Read SQL query or database table into a DataFrame.
pandas.read_sql — pandas 2.1.2 documentation
def to_sql(
frame,
name: str,
con,
schema: str | None = None,
if_exists: Literal["fail", "replace", "append"] = "fail",
index: bool = True,
index_label: IndexLabel | None = None,
chunksize: int | None = None,
dtype: DtypeArg | None = None,
method: Literal["multi"] | Callable | None = None,
engine: str = "auto",
**engine_kwargs,
) -> int | None:
"""
Write records stored in a DataFrame to a SQL database.
Parameters
----------
frame : DataFrame, Series
name : str
Name of SQL table.
con : SQLAlchemy connectable(engine/connection) or database string URI
or sqlite3 DBAPI2 connection
Using SQLAlchemy makes it possible to use any DB supported by that
library.
If a DBAPI2 object, only sqlite3 is supported.
schema : str, optional
Name of SQL schema in database to write to (if database flavor
supports this). If None, use default schema (default).
if_exists : {'fail', 'replace', 'append'}, default 'fail'
- fail: If table exists, do nothing.
- replace: If table exists, drop it, recreate it, and insert data.
- append: If table exists, insert data. Create if does not exist.
index : bool, default True
Write DataFrame index as a column.
index_label : str or sequence, optional
Column label for index column(s). If None is given (default) and
`index` is True, then the index names are used.
A sequence should be given if the DataFrame uses MultiIndex.
chunksize : int, optional
Specify the number of rows in each batch to be written at a time.
By default, all rows will be written at once.
dtype : dict or scalar, optional
Specifying the datatype for columns. If a dictionary is used, the
keys should be the column names and the values should be the
SQLAlchemy types or strings for the sqlite3 fallback mode. If a
scalar is provided, it will be applied to all columns.
method : {None, 'multi', callable}, optional
Controls the SQL insertion clause used:
- None : Uses standard SQL ``INSERT`` clause (one per row).
- ``'multi'``: Pass multiple values in a single ``INSERT`` clause.
- callable with signature ``(pd_table, conn, keys, data_iter) -> int | None``.
Details and a sample callable implementation can be found in the
section :ref:`insert method <io.sql.method>`.
engine : {'auto', 'sqlalchemy'}, default 'auto'
SQL engine library to use. If 'auto', then the option
``io.sql.engine`` is used. The default ``io.sql.engine``
behavior is 'sqlalchemy'
.. versionadded:: 1.3.0
**engine_kwargs
Any additional kwargs are passed to the engine.
Returns
-------
None or int
Number of rows affected by to_sql. None is returned if the callable
passed into ``method`` does not return an integer number of rows.
.. versionadded:: 1.4.0
Notes
-----
The returned rows affected is the sum of the ``rowcount`` attribute of ``sqlite3.Cursor``
or SQLAlchemy connectable. The returned value may not reflect the exact number of written
rows as stipulated in the
`sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>`__ or
`SQLAlchemy <https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.BaseCursorResult.rowcount>`__
""" # noqa: E501
# Reject any if_exists value outside the three supported modes.
if if_exists not in ("fail", "replace", "append"):
raise ValueError(f"'{if_exists}' is not valid for if_exists")
# A Series is promoted to a single-column DataFrame; any other type is rejected.
if isinstance(frame, Series):
frame = frame.to_frame()
elif not isinstance(frame, DataFrame):
raise NotImplementedError(
"'frame' argument should be either a Series or a DataFrame"
)
# NOTE(review): this excerpt stops after argument validation; the code that
# performs the write and produces the documented return value is not shown here.
import pandas as pd
import pyodbc
import openpyxl
from sqlalchemy import create_engine
from urllib.parse import quote_plus

# Connection parameters.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a secrets store before sharing this script.
server = 'localhost'
database = 'tsl'
username = 'sa'
password = 'lqxxx'
driver = 'ODBC Driver 17 for SQL Server'

# Create a SQLAlchemy engine. User name, password and driver name are
# URL-encoded so that spaces and special characters (e.g. "@" or ":" in a
# password) cannot corrupt the connection URL.
engine = create_engine(
    f"mssql+pyodbc://{quote_plus(username)}:{quote_plus(password)}"
    f"@{server}/{database}?driver={quote_plus(driver)}"
)

# Location of the Excel workbook. A raw string needs single backslashes;
# doubling them inside r"..." would put literal "\\" pairs into the path.
filePath = r"C:\Users\Administrator\Documents\traindata20221231.xlsx"

# Load the worksheet named "Sheet0" into a DataFrame.
table = pd.read_excel(filePath, sheet_name="Sheet0")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
table.info()

# Connection test: open and immediately release a connection. This checks
# connectivity only and does not depend on any particular table existing.
try:
    with engine.connect():
        pass
except Exception as error:
    print("connect fail! because of :", error)
else:
    print("connect successfully!")
# import time
# T1 = time.time()
# #用to_sql()方法插入数据,if_exists参数值:"replace"表示如果表存在, 则删掉重建该表, 重新创建;"append"表示如果表存在, 则会追加数据。
# try:
# table.to_sql("trading", con=engine, index=False, if_exists="replace");
# print("insert successfully!")
# except Exception as error:
# print("insert fail! because of:", error)
# print("data write complete!")
# T2 = time.time()
# print('程序运行时间:%s毫秒' % ((T2 - T1)*1000))
#
# RangeIndex: 10233 entries, 0 to 10232
# Data columns (total 11 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 tradingHours 10233 non-null object
# 1 tradingChannel 10233 non-null object
# 2 currencyType 10233 non-null object
# 3 changeInto 10233 non-null float64
# 4 changeOut 10233 non-null float64
# 5 balance 10233 non-null float64
# 6 tradingName 10141 non-null object
# 7 tradingAccount 10153 non-null object
# 8 paymentMethod 10233 non-null object
# 9 postscript 8099 non-null object
# 10 summary 916 non-null object
# dtypes: float64(3), object(8)
# memory usage: 879.5+ KB
# None
# connect successfully!
# insert successfully!
# data write complete!
# 程序运行时间:20926.252126693726毫秒
# [Finished in 39.9s]
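若数据库表已存在,且没有指定if_exists参数,则to_sql方法使用默认值if_exists="fail":此时不会写入任何数据,而是抛出ValueError。如果希望在已有表中追加数据,需要显式指定if_exists="append",写入新数据时不会覆盖原有数据,此时需要注意数据重复问题;如果希望删除旧表后重建并写入,则指定if_exists="replace"。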
to_sql方法写入大量数据时,可能会导致内存不足,需要使用chunksize参数进行分批写入。
to_sql方法写入数据时,默认(method=None)使用标准的SQL INSERT语句逐行插入,数据量较大时可能存在性能问题;可以指定method="multi",在单条INSERT语句中一次传入多行数据,并配合chunksize参数控制每批写入的行数。