-- Load CSV files from Azure Blob Storage into an existing table with COPY.
-- Replace every <...> placeholder with the real account, container and path.
COPY INTO [dbo].[target_table]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<path>'
WITH (
    FILE_TYPE = 'CSV',
    FIELDTERMINATOR = ',',
    ROWTERMINATOR = '\n',
    -- The identity constant is spelled with a space: 'Managed Identity'.
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);
一、不同的数据源格式
Parquet格式
-- Load Parquet files; field/row terminators do not apply to Parquet.
-- Replace every <...> placeholder with the real account, container and path.
COPY INTO [dbo].[target_table]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<path>'
WITH (
    FILE_TYPE = 'PARQUET',
    -- The identity constant is spelled with a space: 'Managed Identity'.
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);
JSON格式(COPY 的 FILE_TYPE 仅支持 CSV、PARQUET、ORC,不直接支持 JSON;可将每行 JSON 作为单列文本加载后再解析)
-- COPY accepts only 'CSV', 'PARQUET' and 'ORC' as FILE_TYPE, so JSON is loaded
-- as text: each line-delimited JSON document becomes one value, to be parsed
-- afterwards with JSON_VALUE / OPENJSON.
-- assumes [dbo].[target_table] has a single NVARCHAR(MAX) column — TODO confirm
COPY INTO [dbo].[target_table]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<path>'
WITH (
    FILE_TYPE = 'CSV',
    -- 0x0b (vertical tab) is not expected inside JSON text, so a whole line
    -- is read as one field and double quotes are left untouched.
    FIELDTERMINATOR = '0x0b',
    FIELDQUOTE = '0x0b',
    ROWTERMINATOR = '0x0a',
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);
二、不同的身份验证方式(除了ManagedIdentity)
共享访问签名(SAS)或存储账户密钥身份验证(COPY 的 CREDENTIAL 不支持 SQL 登录的用户名/密码)
-- Authenticate with a shared access signature instead of a managed identity.
-- COPY's CREDENTIAL takes IDENTITY/SECRET pairs; USERNAME/PASSWORD is not an
-- accepted form. For an account key use IDENTITY = 'Storage Account Key'.
COPY INTO [dbo].[target_table]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<path>'
WITH (
    FILE_TYPE = 'CSV',
    FIELDTERMINATOR = ',',
    ROWTERMINATOR = '\n',
    -- Never commit a real token; inject it at deployment time.
    CREDENTIAL = (IDENTITY = 'Shared Access Signature', SECRET = '<sas_token>')
);
三、处理不同的文件路径和通配符情况
加载文件夹下所有文件(使用通配符)
-- Load every file under a folder by ending the path with a wildcard.
-- Replace every <...> placeholder with the real account, container and folder.
COPY INTO [dbo].[target_table]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<folder>/*'
WITH (
    FILE_TYPE = 'CSV',
    FIELDTERMINATOR = ',',
    ROWTERMINATOR = '\n',
    -- The identity constant is spelled with a space: 'Managed Identity'.
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);
加载特定文件名模式的文件(使用通配符)
-- Load only the files whose names match data_*.csv inside a folder.
-- Replace every <...> placeholder with the real account, container and folder.
COPY INTO [dbo].[target_table]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<folder>/data_*.csv'
WITH (
    FILE_TYPE = 'CSV',
    FIELDTERMINATOR = ',',
    ROWTERMINATOR = '\n',
    -- The identity constant is spelled with a space: 'Managed Identity'.
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);
四、数据转换和映射选项(如果支持)
指定列映射(假设目标表和源数据列不完全匹配)
-- Load into an explicit subset of target columns.
-- NOTE(review): without field numbers the listed columns are presumably
-- matched to source fields by position — confirm against the COPY reference.
COPY INTO [dbo].[target_table] (column1, column2, column3)
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<path>'
WITH (
    FILE_TYPE = 'CSV',
    FIELDTERMINATOR = ',',
    ROWTERMINATOR = '\n',
    -- The identity constant is spelled with a space: 'Managed Identity'.
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);
应用简单的数据转换函数(例如,将源数据中的字符串日期转换为日期类型)
-- COPY has no DATA_CONVERSION option and cannot evaluate expressions, so the
-- conversion is done in two steps: land the raw text, then convert on insert.
-- assumes [dbo].[staging_table] exists with a character column
-- source_date_column — TODO confirm
COPY INTO [dbo].[staging_table] (source_date_column)
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<path>'
WITH (
    FILE_TYPE = 'CSV',
    FIELDTERMINATOR = ',',
    ROWTERMINATOR = '\n',
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);

-- Convert the string date to DATE while moving rows into the target table.
INSERT INTO [dbo].[target_table] (column_date)
SELECT CONVERT(DATE, source_date_column)
FROM [dbo].[staging_table];
FILE_TYPE 参数的更多选项
-- Load ORC files; field/row terminators do not apply to ORC.
-- Replace every <...> placeholder with the real account, container and path.
COPY INTO [dbo].[target_table]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<path>'
WITH (
    FILE_TYPE = 'ORC',
    -- The identity constant is spelled with a space: 'Managed Identity'.
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);
-- COPY in a dedicated SQL pool documents only 'CSV', 'PARQUET' and 'ORC' as
-- FILE_TYPE values; 'DELTA' is not among them. Reading a Delta table's Parquet
-- files directly would bypass its transaction log, so export a consistent
-- Parquet snapshot first (for example from Spark) and load that snapshot.
COPY INTO [dbo].[target_table]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<parquet_snapshot_path>'
WITH (
    FILE_TYPE = 'PARQUET',
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);
FIELDQUOTE 参数(用于带引号的字段,如CSV文件中引号包围的字段)
-- Load CSV whose fields may be wrapped in double quotes.
-- Replace every <...> placeholder with the real account, container and path.
COPY INTO [dbo].[target_table]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<path>'
WITH (
    FILE_TYPE = 'CSV',
    FIELDTERMINATOR = ',',
    FIELDQUOTE = '"',   -- a terminator inside a quoted field is treated as data
    ROWTERMINATOR = '\n',
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);
COMPRESSION 参数(如果数据源是压缩文件)
-- Load a gzip-compressed CSV file.
-- Replace every <...> placeholder with the real account, container and file.
COPY INTO [dbo].[target_table]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<path>/<file>.gz'
WITH (
    FILE_TYPE = 'CSV',
    FIELDTERMINATOR = ',',
    ROWTERMINATOR = '\n',
    COMPRESSION = 'Gzip',   -- spelled as in the COPY reference
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);
FIRSTROW 参数(用于跳过标题行;Dedicated SQL Pool 的 COPY 不提供 HEADER_ROW 选项)
-- Load CSV files that start with a header line.
-- The dedicated SQL pool COPY statement has no HEADER_ROW option; the header
-- is skipped by starting the read at the second row instead.
COPY INTO [dbo].[target_table]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<path>'
WITH (
    FILE_TYPE = 'CSV',
    FIELDTERMINATOR = ',',
    ROWTERMINATOR = '\n',
    FIRSTROW = 2,   -- row numbering starts at 1, so 2 skips one header line
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);
ERRORFILE 参数(用于指定错误文件路径)
-- Load CSV and write rejected rows to an error directory.
-- The error directory is given relative to the source container so the same
-- CREDENTIAL is reused; a full URL would need its own ERRORFILE_CREDENTIAL.
-- NOTE(review): MAXERRORS defaults to 0, so the load still fails on the first
-- bad row unless MAXERRORS is raised — confirm the intended tolerance.
COPY INTO [dbo].[target_table]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<path>'
WITH (
    FILE_TYPE = 'CSV',
    FIELDTERMINATOR = ',',
    ROWTERMINATOR = '\n',
    CREDENTIAL = (IDENTITY = 'Managed Identity'),
    ERRORFILE = '/<error_directory>'
);
使用 CTAS 或 INSERT INTO 可以快速从外部数据源加载数据。
-- CTAS 语句示例
-- PolyBase cannot select from an external data source inline. The files must
-- be exposed through an external table, which needs a data source and a file
-- format; CTAS then reads the external table like any other table.
-- NOTE(review): no CREDENTIAL is attached, which presumably only works for
-- publicly readable containers — confirm.
CREATE EXTERNAL DATA SOURCE [AzureBlobStorage]
WITH (
    TYPE = HADOOP,
    LOCATION = 'wasbs://<container>@<storage_account>.blob.core.windows.net'
);

CREATE EXTERNAL FILE FORMAT [csv_file_format]
WITH (
    FORMAT_TYPE = DELIMITEDTEXT,
    FORMAT_OPTIONS (FIELD_TERMINATOR = ',')
);

-- Column names and types are placeholders; they must match the file layout.
CREATE EXTERNAL TABLE [dbo].[external_table]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH (
    LOCATION = '/<path>/',
    DATA_SOURCE = [AzureBlobStorage],
    FILE_FORMAT = [csv_file_format]
);

-- CTAS in a dedicated SQL pool takes its table options in a WITH clause
-- placed before AS.
CREATE TABLE [dbo].[target_table]
WITH (
    DISTRIBUTION = ROUND_ROBIN,
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT
    column1,
    column2,
    column3
FROM [dbo].[external_table];
-- INSERT INTO example: append the rows read through an existing external table.
-- An external table is referenced like any other table; "FROM EXTERNAL TABLE"
-- is not valid T-SQL.
INSERT INTO [dbo].[target_table]
SELECT *
FROM [external_table];
以下是不同参数的PolyBase命令相关内容:
一、使用 INSERT INTO 加载数据
-- INSERT INTO example.
-- PolyBase reads files only through an external table, never directly from an
-- external data source.
-- assumes [dbo].[external_table] was created with CREATE EXTERNAL TABLE over
-- the [AzureBlobStorage] data source and a delimited-text file format
INSERT INTO [dbo].[target_table]
SELECT *
FROM [dbo].[external_table];
在这个命令中, INSERT INTO 用于将从外部数据源(这里是Azure Blob Storage指定位置的文件)查询到的数据插入到已存在的 target_table 表中。
二、指定不同文件格式参数
Parquet文件格式
-- CTAS example (Parquet).
-- The format is declared once as an external file format and referenced by
-- the external table; CTAS then copies the rows into a new internal table.
-- assumes the external data source [AzureBlobStorage] already exists
CREATE EXTERNAL FILE FORMAT [parquet_file_format]
WITH (
    FORMAT_TYPE = PARQUET
);

-- Column names and types are placeholders; they must match the Parquet schema.
CREATE EXTERNAL TABLE [dbo].[parquet_external_table]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH (
    LOCATION = '/<path>/',
    DATA_SOURCE = [AzureBlobStorage],
    FILE_FORMAT = [parquet_file_format]
);

CREATE TABLE [dbo].[parquet_target_table]
WITH (
    DISTRIBUTION = ROUND_ROBIN,
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT
    column1,
    column2,
    column3
FROM [dbo].[parquet_external_table];
这里将文件格式参数 FILE_FORMAT 指定为 PARQUET ,用于加载Parquet格式的数据到新创建的表 parquet_target_table 中。
JSON文件格式(假设支持)
-- CTAS example (JSON input).
-- Dedicated SQL pool external file formats cover DELIMITEDTEXT, RCFILE, ORC
-- and PARQUET only, so JSON cannot be read through PolyBase. Each
-- line-delimited JSON document is staged as text and shredded in the CTAS.
CREATE TABLE [dbo].[json_staging]
(
    json_doc NVARCHAR(MAX)
)
WITH (
    DISTRIBUTION = ROUND_ROBIN,
    HEAP   -- conservative storage choice for a MAX column
);

COPY INTO [dbo].[json_staging]
FROM 'https://<storage_account>.blob.core.windows.net/<container>/<path>'
WITH (
    FILE_TYPE = 'CSV',
    -- 0x0b is not expected inside JSON text, so each line stays one field.
    FIELDTERMINATOR = '0x0b',
    FIELDQUOTE = '0x0b',
    ROWTERMINATOR = '0x0a',
    CREDENTIAL = (IDENTITY = 'Managed Identity')
);

-- Property names and target types are placeholders for the real document shape.
CREATE TABLE [dbo].[json_target_table]
WITH (
    DISTRIBUTION = ROUND_ROBIN,
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT
    CAST(JSON_VALUE(json_doc, '$.column1') AS INT) AS column1,
    CAST(JSON_VALUE(json_doc, '$.column2') AS VARCHAR(100)) AS column2
FROM [dbo].[json_staging];
Azure Synapse Dedicated SQL Pool 的外部文件格式仅支持 DELIMITEDTEXT、RCFILE、ORC 和 PARQUET,不能直接把文件格式设为 JSON;要把 JSON 数据加载到 json_target_table 表,可先将每行 JSON 作为单列文本载入暂存表,再用 JSON_VALUE 或 OPENJSON 解析。
三、指定数据源认证相关参数(如果需要)
假设数据源需要某种形式的认证,可能会有类似如下的参数(以下是示例,实际认证方式因数据源和配置而异):
-- CTAS example with authentication.
-- A credential is not a CTAS option: it is a database scoped credential
-- attached to the external data source that the external table reads through.
-- Prerequisite: the database must already have a master key (CREATE MASTER KEY).
CREATE DATABASE SCOPED CREDENTIAL [MyCredential]
WITH IDENTITY = 'Managed Service Identity';

CREATE EXTERNAL DATA SOURCE [SecuredStorage]
WITH (
    TYPE = HADOOP,
    LOCATION = 'abfss://<container>@<storage_account>.dfs.core.windows.net',
    CREDENTIAL = [MyCredential]
);

-- assumes a delimited-text file format [csv_file_format] already exists
-- Column names and types are placeholders; they must match the file layout.
CREATE EXTERNAL TABLE [dbo].[external_table_with_auth]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH (
    LOCATION = '/<path>/',
    DATA_SOURCE = [SecuredStorage],
    FILE_FORMAT = [csv_file_format]
);

CREATE TABLE [dbo].[target_table_with_auth]
WITH (
    DISTRIBUTION = ROUND_ROBIN,
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT
    column1,
    column2,
    column3
FROM [dbo].[external_table_with_auth];
在这个示例中,CREDENTIAL 用于指定访问外部数据源所需的认证凭据(即预先创建并关联到外部数据源的数据库范围凭据),从而安全地访问需要认证的存储资源。
-- Example: export data from the dedicated SQL pool to Azure Blob Storage.
-- "EXPORT TO EXTERNAL DATA SOURCE" is not T-SQL; PolyBase exports with
-- CREATE EXTERNAL TABLE AS SELECT (CETAS), which writes the result set as
-- files under LOCATION.
-- assumes data source [AzureBlobStorage] and a delimited-text file format
-- [csv_file_format] already exist
CREATE EXTERNAL TABLE [dbo].[source_table_export_csv]
WITH (
    LOCATION = '/<export_path>/',
    DATA_SOURCE = [AzureBlobStorage],
    FILE_FORMAT = [csv_file_format]
)
AS
SELECT *
FROM [dbo].[source_table];
以下是 PolyBase 导出命令中不同参数的一些情况:
-- Export to ADLS Gen2 as Parquet with CETAS (there is no EXPORT TO statement).
-- assumes data source [ADLSGen2] points at
-- abfss://<container>@<storage_account>.dfs.core.windows.net and that a
-- Parquet file format [parquet_file_format] already exists
CREATE EXTERNAL TABLE [dbo].[source_table_export_parquet]
WITH (
    LOCATION = '/<export_path>/',
    DATA_SOURCE = [ADLSGen2],
    FILE_FORMAT = [parquet_file_format]
)
AS
SELECT *
FROM [dbo].[source_table];
-- A dedicated SQL pool cannot export straight into a remote SQL Server, and
-- it cannot write JSON or AVRO: CETAS writes files to Hadoop/Azure storage
-- only, in the DELIMITEDTEXT, RCFILE, ORC or PARQUET external file formats.
-- ORC to storage is the supported variant shown here. To deliver JSON/AVRO,
-- or to push rows into SQL Server, process the exported files with a pipeline
-- or a Spark job.
-- assumes data source [AzureBlobStorage] already exists
CREATE EXTERNAL FILE FORMAT [orc_file_format]
WITH (
    FORMAT_TYPE = ORC
);

CREATE EXTERNAL TABLE [dbo].[source_table_export_orc]
WITH (
    LOCATION = '/<export_path>/',
    DATA_SOURCE = [AzureBlobStorage],
    FILE_FORMAT = [orc_file_format]
)
AS
SELECT *
FROM [dbo].[source_table];
-- CETAS variations: the SELECT decides what is exported. Each export needs its
-- own external table name and its own LOCATION.
-- assumes data source [AzureBlobStorage] and a delimited-text file format
-- [csv_file_format] already exist

-- Export a subset of columns.
CREATE EXTERNAL TABLE [dbo].[source_table_export_columns]
WITH (
    LOCATION = '/<export_path_columns>/',
    DATA_SOURCE = [AzureBlobStorage],
    FILE_FORMAT = [csv_file_format]
)
AS
SELECT
    column1,
    column2
FROM [dbo].[source_table];

-- Export a computed column; the expression needs an alias to become a column
-- of the external table.
CREATE EXTERNAL TABLE [dbo].[source_table_export_upper]
WITH (
    LOCATION = '/<export_path_upper>/',
    DATA_SOURCE = [AzureBlobStorage],
    FILE_FORMAT = [csv_file_format]
)
AS
SELECT UPPER(column_name) AS column_name_upper
FROM [dbo].[source_table];

-- Export only the rows that pass a simple filter.
CREATE EXTERNAL TABLE [dbo].[source_table_export_filtered]
WITH (
    LOCATION = '/<export_path_filtered>/',
    DATA_SOURCE = [AzureBlobStorage],
    FILE_FORMAT = [csv_file_format]
)
AS
SELECT *
FROM [dbo].[source_table]
WHERE column_value > 10;

-- Export only the rows that pass a combined filter.
CREATE EXTERNAL TABLE [dbo].[source_table_export_filtered_like]
WITH (
    LOCATION = '/<export_path_filtered_like>/',
    DATA_SOURCE = [AzureBlobStorage],
    FILE_FORMAT = [csv_file_format]
)
AS
SELECT *
FROM [dbo].[source_table]
WHERE column_value > 10
  AND column_name LIKE '%abc%';
-- Three-column table with explicit storage and distribution options.
CREATE TABLE [dbo].[new_table]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH
(
    CLUSTERED COLUMNSTORE INDEX,    -- store as a clustered columnstore index
    DISTRIBUTION = HASH(column1)    -- distribution method: hash on column1
);
在Azure Synapse Dedicated SQL Pool中,除了上述基本的创建表命令形式外,还有以下几种常见形式:
不指定分布方法(采用默认分布)
-- Same table without a DISTRIBUTION option, so the pool's default applies
-- (documented default: ROUND_ROBIN).
CREATE TABLE [dbo].[new_table]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH
(
    CLUSTERED COLUMNSTORE INDEX
);
这种情况下,系统采用默认的分布方式,即 ROUND_ROBIN(轮询分布)。
创建堆表(无聚集索引)
-- Heap table (no clustered index).
-- HEAP must be requested explicitly: when no index option is given, a
-- dedicated SQL pool creates a clustered columnstore index instead.
CREATE TABLE [dbo].[new_table]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH (
    HEAP
);
堆表没有聚集索引,数据存储时不会按照特定的索引结构排序,在某些特定场景下(如向暂存表快速插入大量数据)可能会用到。注意:在 Dedicated SQL Pool 中,未指定索引选项时默认创建聚集列存储索引,因此创建堆表需要显式指定 WITH (HEAP)。
基于现有表创建新表(仅复制结构)
-- Clone only the column layout: TOP 0 returns no rows, so the new table is
-- created empty.
SELECT TOP 0
    *
INTO [dbo].[new_table]
FROM [dbo].[existing_table];
这会创建一个与 existing_table 结构相同的新表 new_table ,但是不会复制数据,只有表结构被复制过来。
带有约束条件的表创建
-- Table with the constraints a dedicated SQL pool accepts.
-- * PRIMARY KEY is allowed only as NONCLUSTERED ... NOT ENFORCED: it informs
--   the optimizer, but uniqueness must be guaranteed by the load process.
-- * CHECK constraints are not supported, so the column3 > '2025-01-01' rule
--   has to be validated while loading, e.g. with a WHERE filter on the source.
CREATE TABLE [dbo].[new_table]
(
    column1 INT NOT NULL,
    column2 VARCHAR(100) NOT NULL,
    column3 DATETIME,
    CONSTRAINT new_table_column1_pk PRIMARY KEY NONCLUSTERED (column1) NOT ENFORCED
)
WITH (
    DISTRIBUTION = HASH(column1),
    CLUSTERED COLUMNSTORE INDEX
);
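这里的 column1 被定义为主键(在 Dedicated SQL Pool 中主键必须声明为 NONCLUSTERED ... NOT ENFORCED,系统不会强制唯一性),column2 设置为非空;Dedicated SQL Pool 不支持 CHECK 约束,因此 column3 必须大于 2025-01-01 这类规则需要在数据加载或查询时自行校验。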
-- Round-robin distributed table stored as a clustered columnstore index.
CREATE TABLE [dbo].[new_table]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH
(
    CLUSTERED COLUMNSTORE INDEX,
    DISTRIBUTION = ROUND_ROBIN
);
ROUND_ROBIN 分布是将数据均匀地轮流分配到各个分布(distribution)中,不依赖于某一列的值。
-- Replicated table stored as a clustered columnstore index.
CREATE TABLE [dbo].[new_table]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH
(
    CLUSTERED COLUMNSTORE INDEX,
    DISTRIBUTION = REPLICATE
);
REPLICATE分布会将整个表的数据复制到每个计算节点,适用于小表,方便在各个节点本地访问,减少数据移动开销。
-- NONCLUSTERED COLUMNSTORE INDEX is not a table option in a dedicated SQL
-- pool (the choices are CLUSTERED COLUMNSTORE INDEX, HEAP or CLUSTERED INDEX).
-- To favour specific columns, use an ordered clustered columnstore index.
CREATE TABLE [dbo].[new_table]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH (
    DISTRIBUTION = HASH(column1),
    CLUSTERED COLUMNSTORE INDEX ORDER (column1, column2)
);
Dedicated SQL Pool 不支持非聚集列存储索引;若需针对指定列优化查询,可使用有序聚集列存储索引(CLUSTERED COLUMNSTORE INDEX ORDER (...)),或在表上另外创建非聚集行存储索引,在某些查询场景下可以提供更好的性能。
-- Hash-distributed table stored as a rowstore clustered index on column1.
CREATE TABLE [dbo].[new_table]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH
(
    CLUSTERED INDEX (column1),
    DISTRIBUTION = HASH(column1)
);
聚集索引会根据索引列对数据进行物理排序存储,与列存储索引的存储和使用方式有所不同,适合特定的查询模式,例如基于范围的查询。
-- Partitioned, hash-distributed columnstore table.
CREATE TABLE [dbo].[new_table]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH
(
    CLUSTERED COLUMNSTORE INDEX,
    DISTRIBUTION = HASH(column1),
    -- RANGE RIGHT: each boundary value belongs to the partition on its right.
    PARTITION
    (
        column3 RANGE RIGHT FOR VALUES ('2025-01-01', '2025-02-01')
    )
);
这个命令创建了一个分区表,根据 column3 列的值进行分区,这里使用了 RANGE RIGHT 分区方式,将数据按照给定的日期值分区存储,有助于提高数据管理和查询性能,特别是针对时间序列等数据。
-- Add a column: appends new_column (INT) to the existing table.
ALTER TABLE [dbo].[existing_table]
    ADD new_column INT;
-- Change the distribution method.
-- ALTER TABLE cannot change a table's distribution. The table is rebuilt with
-- CTAS under the new distribution (keyword REPLICATE, not REPLICATED) and the
-- names are then swapped.
CREATE TABLE [dbo].[existing_table_replicated]
WITH (
    DISTRIBUTION = REPLICATE,
    CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT *
FROM [dbo].[existing_table];

RENAME OBJECT [dbo].[existing_table] TO [existing_table_old];
RENAME OBJECT [dbo].[existing_table_replicated] TO [existing_table];

-- Drop the old copy only after the new table has been verified.
DROP TABLE [dbo].[existing_table_old];
-- Drop the table definition together with its data.
DROP TABLE
    [dbo].[table_name];
基本的删除表命令
-- Drop a table whose name is only known at run time.
-- Schema and table are kept as separate bare names and passed through
-- QUOTENAME, so a crafted value cannot inject extra statements into the
-- dynamic SQL string.
DECLARE @schemaName NVARCHAR(128) = N'dbo';
DECLARE @tableName NVARCHAR(128) = N'dynamic_table';
DECLARE @qualifiedName NVARCHAR(300) = QUOTENAME(@schemaName) + N'.' + QUOTENAME(@tableName);
DECLARE @sql NVARCHAR(MAX);

-- Skip the drop when the table does not exist instead of raising an error.
IF OBJECT_ID(@qualifiedName, 'U') IS NOT NULL
BEGIN
    SET @sql = N'DROP TABLE ' + @qualifiedName;
    EXEC sp_executesql @sql;
END;
-- Drop the table only when a user table ('U') with that name exists.
IF OBJECT_ID('dbo.table_to_check', 'U') IS NOT NULL
    DROP TABLE [dbo].[table_to_check];
-- Create a clustered columnstore index named CCI_Index on the table.
CREATE CLUSTERED COLUMNSTORE INDEX CCI_Index
    ON [dbo].[table_name];

-- Remove an index from the table by name.
DROP INDEX [index_name]
    ON [dbo].[table_name];
-- Materialized view that pre-aggregates column3 per (column1, column2).
-- A dedicated SQL pool requires a distribution option on every materialized
-- view; hashing on a grouping column keeps each group within one distribution.
CREATE MATERIALIZED VIEW [dbo].[materialized_view_name]
WITH (DISTRIBUTION = HASH(column1))
AS
SELECT
    column1,
    column2,
    SUM(column3) AS total
FROM [dbo].[table_name]
GROUP BY
    column1,
    column2;
物化视图命令的基本形式
-- Syntax template; [SELECT query] stands for the defining query.
-- The WITH (DISTRIBUTION = ...) clause is mandatory: use ROUND_ROBIN or
-- HASH(<column>).
CREATE MATERIALIZED VIEW [schema_name].[materialized_view_name]
WITH (DISTRIBUTION = ROUND_ROBIN)
AS
[SELECT query];
其中 schema_name 是架构名, materialized_view_name 是物化视图名称, SELECT query 是用于定义物化视图内容的查询语句,如示例中的按 column1 和 column2 分组并计算 column3 总和的查询。
-- There is no REFRESH option: a materialized view in a dedicated SQL pool is
-- kept in sync with its base tables automatically. ALTER MATERIALIZED VIEW
-- supports REBUILD (re-enable or rebuild the view) and DISABLE.
ALTER MATERIALIZED VIEW [dbo].[materialized_view_name] REBUILD;
物化视图会随基础表的变化自动保持同步,无需手动刷新;REBUILD 用于重建物化视图(例如在 DISABLE 之后重新启用,或在基础表发生大量变更后降低查询开销)。
-- A materialized view is dropped with DROP VIEW; there is no
-- DROP MATERIALIZED VIEW statement in a dedicated SQL pool.
DROP VIEW [dbo].[materialized_view_name];
用于删除不再需要的物化视图。
不同参数的物化视图命令(以创建为例)
-- Syntax template: hash-distributed materialized view.
-- [distribution_column] and [SELECT query] are placeholders.
CREATE MATERIALIZED VIEW [dbo].[materialized_view_name]
WITH
(
    DISTRIBUTION = HASH([distribution_column])
)
AS
[SELECT query];
这里 DISTRIBUTION = HASH([distribution_column]) 指定了分布方式,通过对 distribution_column 进行哈希分布来存储物化视图数据,提升查询性能。
-- Syntax template; [SELECT query] stands for the defining query.
-- CREATE MATERIALIZED VIEW has no INDEX option: the WITH clause carries the
-- distribution option, and the pool stores the view as a clustered
-- columnstore index on its own.
CREATE MATERIALIZED VIEW [dbo].[materialized_view_name]
WITH (DISTRIBUTION = ROUND_ROBIN)
AS
[SELECT query];
物化视图没有 INDEX 选项:Dedicated SQL Pool 会自动以聚集列存储索引存储物化视图,WITH 子句中只需指定分布方式。不过需要注意的是,物化视图会增加存储和维护成本。
-- Example: a query written for better performance.
-- Inner join of table_a and table_b on their shared column1.
SELECT
    ta.column1,
    tb.column2
FROM [dbo].[table_a] AS ta
INNER JOIN [dbo].[table_b] AS tb
    ON tb.column1 = ta.column1;
-- The three statements are alternatives for the same table name: run one.
-- CREATE TABLE needs a column list; the columns below are placeholders.

-- HASH distribution
CREATE TABLE [dbo].[table_name]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH (DISTRIBUTION = HASH(column1));

-- REPLICATE distribution (the keyword is REPLICATE, not REPLICATED)
CREATE TABLE [dbo].[table_name]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH (DISTRIBUTION = REPLICATE);

-- ROUND_ROBIN distribution
CREATE TABLE [dbo].[table_name]
(
    column1 INT,
    column2 VARCHAR(100),
    column3 DATETIME
)
WITH (DISTRIBUTION = ROUND_ROBIN);
-- Assign a resource class.
-- Resource classes are database roles, not session settings: a user gets the
-- larger allocation by being added to the role, and it applies to that user's
-- requests. sp_set_session_context only stores a key/value pair and has no
-- effect on resource allocation.
EXEC sp_addrolemember 'largerc', '<user_name>';
-- Configure a query timeout (seconds) and cap concurrency.
-- QUERY_TIMEOUT and MAX_CONCURRENCY are not database scoped configurations in
-- a dedicated SQL pool; both are controlled through workload management.
CREATE WORKLOAD GROUP [wg_limited]
WITH (
    MIN_PERCENTAGE_RESOURCE = 0,
    CAP_PERCENTAGE_RESOURCE = 100,
    -- 100 / 2.5 allows up to 40 concurrent requests in this group.
    -- NOTE(review): the effective minimum grant depends on the service level,
    -- so smaller pools allow fewer — confirm for the target DWU.
    REQUEST_MIN_RESOURCE_GRANT_PERCENT = 2.5,
    -- Requests running longer than 120 seconds are cancelled.
    QUERY_EXECUTION_TIMEOUT_SEC = 120
);

-- Route a user or role into the workload group.
CREATE WORKLOAD CLASSIFIER [wc_limited]
WITH (
    WORKLOAD_GROUP = 'wg_limited',
    MEMBERNAME = '<user_or_role>'
);
Azure Synapse Studio 提供了查询性能见解的图形化界面,用户可以通过此界面监控查询性能,识别性能瓶颈,分析查询计划等。
-- Query execution statistics.
SELECT
    *
FROM sys.dm_pdw_exec_requests;

-- Query wait statistics.
SELECT
    *
FROM sys.dm_pdw_waits;
-- sys.dm_exec_query_plan takes a plan handle, not a request id, and is not the
-- way to inspect a distributed plan in a dedicated SQL pool. The steps of a
-- request are listed in sys.dm_pdw_request_steps; take the request_id (for
-- example 'QID1234') from sys.dm_pdw_exec_requests.
SELECT *
FROM sys.dm_pdw_request_steps
WHERE request_id = '<request_id>'
ORDER BY step_index;