From e3e7f606a48d7c8061d8b1ecfd061238a43ec8f7 Mon Sep 17 00:00:00 2001 From: Grace Date: Mon, 5 Jul 2021 13:40:55 +0800 Subject: [PATCH 01/25] Add entry in parent content directory, and minor format fix --- Pipelines/ImportADFtoSynapse/README.md | 12 +++++------- README.md | 4 ++++ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Pipelines/ImportADFtoSynapse/README.md b/Pipelines/ImportADFtoSynapse/README.md index c795dbe..68c1bc0 100644 --- a/Pipelines/ImportADFtoSynapse/README.md +++ b/Pipelines/ImportADFtoSynapse/README.md @@ -61,14 +61,12 @@ You can get the Resource IDs for Azure Data Factory and Azure Synapse Analytics. .\importADFtoSynapseTool.ps1 -ConfigFile appsettings.json ``` -[!NOTE] -* Existing resources in destination workspace with the same name will be overwritten. -* Same networking setting between Azure Data Factory and Synapse is required. +>[!NOTE] +> * Existing resources in destination workspace with the same name will be overwritten. +> * Same networking setting between Azure Data Factory and Synapse is required. e.g. Managed Virtual Network of Azure Data Factory is enabled on both Azure Data Factory and Synapse. -* The migration tool does not support migration of ADF SSIS pipeline -* Refer to the [Troubleshooting Guide](./Troubleshooting.md) if you run into any issues when using the migration PowerShell script - -* * * +> * The migration tool does not support migration of ADF SSIS pipeline +> * Refer to the [Troubleshooting Guide](./Troubleshooting.md) if you run into any issues when using the migration PowerShell script ## How do I exclude specific objects from my Data Factory source factory? This migration tool will migrate all objects from the published version of your factory that is live in Azure. You will have to remove any objects that you do not wish to migrate to your Synapse workspace. If you do not wish to modify your source ADF, then you should make a copy of the existing factory, remove the objects you do not wish to migrate, and use that new factory as your source. diff --git a/README.md b/README.md index 90eb451..34de4e3 100644 --- a/README.md +++ b/README.md @@ -31,5 +31,9 @@ Shows .NET for Spark and shared metadata experience between Spark created tables * [Description and notebooks/code files](Notebooks/Spark.NET%20C%23/Tweets) * [Sample Data](Data/Tweets)) +### ADF to Synapse Migration Tool + +The [ADF to Synapse Migration Tool](Pipelines/ImportADFtoSynapse) (currently PowerShell scripts) enables you to migrate Azure Data Factory pipelines, datasets, linked service, integration runtime and triggers to a Synapse Analytics Workspace. + ## Contributing This project welcomes contributions and suggestions. 
See the [Contributor's guide](https://github.com/Azure-Samples/Synapse/tree/master/CONTRIBUTE.md) From e8697fea38253e2826d137c22846d939911287a3 Mon Sep 17 00:00:00 2001 From: Kaiyue Zhou <70192016+kaiyuezhou@users.noreply.github.com> Date: Wed, 14 Jul 2021 15:48:24 +0800 Subject: [PATCH 02/25] Update Spark application Log Analytics diagnostic workbook (#110) * Fix chart titles and some queries * Update Azure_Synapse_Spark_Application.workbook --- .../Azure_Synapse_Spark_Application.workbook | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Diagnostic/LogAnalytics/Azure_Synapse_Spark_Application.workbook b/Diagnostic/LogAnalytics/Azure_Synapse_Spark_Application.workbook index 8ab6efd..9da139e 100644 --- a/Diagnostic/LogAnalytics/Azure_Synapse_Spark_Application.workbook +++ b/Diagnostic/LogAnalytics/Azure_Synapse_Spark_Application.workbook @@ -109,7 +109,8 @@ "type": 2, "description": "Spark Application Livy Id and Name", "isRequired": true, - "query": "SparkMetrics_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and isnotempty(livyId_s)\r\n| summarize by livyId_s, applicationName_s\r\n| order by livyId_s desc\r\n| extend applicationName = substring(applicationName_s, 0, strlen(applicationName_s) - 1)\r\n| project value = livyId_s, label = strcat(livyId_s, \" | \", applicationName), selected = false", + "query": "SparkMetrics_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and isnotempty(livyId_s)\r\n| extend applicationName = column_ifexists(\"applicationName_s\", applicationId_s)\r\n| summarize by livyId_s, applicationName\r\n| order by livyId_s desc\r\n| project value = livyId_s, label = strcat(livyId_s, \" | \", applicationName), selected = false", + "value": null, "typeSettings": { "additionalResourceOptions": [], "showDefault": false @@ -182,7 +183,7 @@ "id": "9246e88b-9682-498c-b440-f53a15b6e481", "cellValue": "selectedTab", "linkTarget": "parameter", - "linkLabel": "Spark Streaming", + "linkLabel": "Streaming", "subTarget": "Streaming", "preText": "Metrics", "postText": "1", @@ -255,7 +256,7 @@ "additionalResourceOptions": [ "value::all" ], - "selectAllValue": "ALL", + "selectAllValue": "All", "showDefault": false }, "timeContext": { @@ -277,7 +278,7 @@ "comparison": "isEqualTo", "value": "Logs" }, - "name": "CustomMetricsSelector - Copy" + "name": "Logs Filter Dropdown" }, { "type": 3, @@ -566,7 +567,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "let Data = SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\";\r\n\r\nunion\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(progress_sources_s)\r\n| mv-expand progress_sources\r\n| project TimeGenerated, InputRows=tolong(progress_sources.numInputRows)\r\n| summarize Value=sum(InputRows)\r\n| extend Name=\"Total Input Rows\", Order=0\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Progress Events\", Order=1\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryStartedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Starts\", Order=2\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminatedEvent\"\r\n| 
count as Value\r\n| extend Name=\"Total Query Terminateds\", Order=3\r\n)\r\n", + "query": "let Data = SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\";\r\n\r\nunion\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(column_ifexists(\"progress_sources_s\", \"{}\"))\r\n| mv-expand progress_sources\r\n| project TimeGenerated, InputRows=tolong(progress_sources.numInputRows)\r\n| summarize Value=sum(InputRows)\r\n| extend Name=\"Total Input Rows\", Order=0\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Progress Events\", Order=1\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryStartedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Starts\", Order=2\r\n),\r\n(Data\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminatedEvent\"\r\n| count as Value\r\n| extend Name=\"Total Query Terminateds\", Order=3\r\n)\r\n", "size": 4, "showAnnotations": true, "showAnalytics": true, @@ -1637,7 +1638,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(progress_sources_s)\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), ProcessedRowsPerSecond=todouble(progress_sources.processedRowsPerSecond)\r\n| summarize Value=sum(ProcessedRowsPerSecond) by TimeGenerated, Description\r\n| order by TimeGenerated asc\r\n\r\n// InputRowsPerSecond=todouble(progress_sources.inputRowsPerSecond), ", + "query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(column_ifexists(\"progress_sources_s\", \"{}\"))\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), ProcessedRowsPerSecond=todouble(progress_sources.processedRowsPerSecond)\r\n| summarize Value=sum(ProcessedRowsPerSecond) by TimeGenerated, Description\r\n| order by TimeGenerated asc\r\n\r\n// InputRowsPerSecond=todouble(progress_sources.inputRowsPerSecond), ", "size": 1, "aggregation": 5, "showAnalytics": true, @@ -1664,7 +1665,7 @@ "type": 3, "content": { "version": "KqlItem/1.0", - "query": "SparkListenerEvent_CL\r\n| where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(progress_sources_s)\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), InputRows=tolong(progress_sources.numInputRows)\r\n| summarize sum(InputRows)/count() by bin(TimeGenerated, timespan({Interval})), Description\r\n| order by TimeGenerated asc", + "query": "SparkListenerEvent_CL\r\n| 
where workspaceName_s == \"{Workspace}\" and clusterName_s == \"{SparkPool}\" and livyId_s == \"{LivyId}\"\r\n| where Event_s == \"org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgressEvent\"\r\n| project TimeGenerated, progress_sources=parse_json(column_ifexists(\"progress_sources_s\", \"{}\"))\r\n| mv-expand progress_sources\r\n| project TimeGenerated, Description=tostring(progress_sources.description), InputRows=tolong(progress_sources.numInputRows)\r\n| summarize sum(InputRows)/count() by bin(TimeGenerated, timespan({Interval})), Description\r\n| order by TimeGenerated asc", "size": 1, "aggregation": 5, "showAnalytics": true, From 168408f119da1534d4f8fd114818d15b02133344 Mon Sep 17 00:00:00 2001 From: "Jovan Popovic (MSFT)" Date: Thu, 15 Jul 2021 17:27:11 +0200 Subject: [PATCH 03/25] Adding Delta Lake settings --- SQL/Samples/LdwSample/SampleDB.sql | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/SQL/Samples/LdwSample/SampleDB.sql b/SQL/Samples/LdwSample/SampleDB.sql index 329d603..c533051 100644 --- a/SQL/Samples/LdwSample/SampleDB.sql +++ b/SQL/Samples/LdwSample/SampleDB.sql @@ -29,6 +29,10 @@ IF (EXISTS(SELECT * FROM sys.external_file_formats WHERE name = 'NativeParquet') DROP EXTERNAL FILE FORMAT NativeParquet END GO +IF (EXISTS(SELECT * FROM sys.external_file_formats WHERE name = 'DeltaLakeFormat')) BEGIN + DROP EXTERNAL FILE FORMAT DeltaLakeFormat +END +GO DROP SCHEMA IF EXISTS parquet; GO DROP SCHEMA IF EXISTS csv; @@ -40,6 +44,10 @@ IF (EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'SqlOnDemandDemo DROP EXTERNAL DATA SOURCE SqlOnDemandDemo END +IF (EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'DeltaLakeStorage')) BEGIN + DROP EXTERNAL DATA SOURCE DeltaLakeStorage +END + IF (EXISTS(SELECT * FROM sys.external_data_sources WHERE name = 'AzureOpenData')) BEGIN DROP EXTERNAL DATA SOURCE AzureOpenData END @@ -112,6 +120,14 @@ CREATE EXTERNAL DATA SOURCE SqlOnDemandDemo WITH ( CREDENTIAL = sqlondemand ); GO + +-- Data source referencing Delta Lake folders +CREATE EXTERNAL DATA SOURCE DeltaLakeStorage WITH ( + LOCATION = 'https://sqlondemandstorage.blob.core.windows.net/delta-lake', + CREDENTIAL = sqlondemand +); +GO + -- Create publicly available external data sources CREATE EXTERNAL DATA SOURCE AzureOpenData WITH ( LOCATION = 'https://azureopendatastorage.blob.core.windows.net/') @@ -148,6 +164,11 @@ WITH ( FORMAT_TYPE = PARQUET ); GO +CREATE EXTERNAL FILE FORMAT DeltaLakeFormat +WITH ( + FORMAT_TYPE = DELTA +); +GO CREATE EXTERNAL TABLE csv.population ( From f01e4b4c69eb3fc9f670c8df181f00a6fbf9682b Mon Sep 17 00:00:00 2001 From: "Jovan Popovic (MSFT)" Date: Wed, 21 Jul 2021 16:03:35 +0200 Subject: [PATCH 04/25] Added TPCDS views --- SQL/databases/tpcds/schema-views.sql | 689 +++++++++++++++++++++++++++ 1 file changed, 689 insertions(+) create mode 100644 SQL/databases/tpcds/schema-views.sql diff --git a/SQL/databases/tpcds/schema-views.sql b/SQL/databases/tpcds/schema-views.sql new file mode 100644 index 0000000..c914f2c --- /dev/null +++ b/SQL/databases/tpcds/schema-views.sql @@ -0,0 +1,689 @@ +DROP VIEW IF EXISTS [call_center]; +GO + +DROP VIEW IF EXISTS [catalog_page]; +GO + +DROP VIEW IF EXISTS [catalog_returns]; +GO + +DROP VIEW IF EXISTS [catalog_sales]; +GO + +DROP VIEW IF EXISTS [customer]; +GO + +DROP VIEW IF EXISTS [customer_address]; +GO + +DROP VIEW IF EXISTS [customer_demographics]; +GO + +DROP VIEW IF EXISTS [date_dim]; +GO + +DROP VIEW IF EXISTS [household_demographics]; +GO + +DROP VIEW IF EXISTS 
[income_band]; +GO + +DROP VIEW IF EXISTS [inventory]; +GO + +DROP VIEW IF EXISTS [item]; +GO + +DROP VIEW IF EXISTS [promotion]; +GO + +DROP VIEW IF EXISTS [reason]; +GO + +DROP VIEW IF EXISTS [ship_mode]; +GO + +DROP VIEW IF EXISTS [store]; +GO + +DROP VIEW IF EXISTS [store_returns]; +GO + +DROP VIEW IF EXISTS [store_sales]; +GO + +DROP VIEW IF EXISTS [time_dim]; +GO + +DROP VIEW IF EXISTS [warehouse]; +GO + +DROP VIEW IF EXISTS [web_page]; +GO + +DROP VIEW IF EXISTS [web_site]; +GO + +DROP VIEW IF EXISTS [web_returns]; +GO + +DROP VIEW IF EXISTS [web_sales]; +GO + + +CREATE VIEW [call_center] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/call_center/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + CC_CALL_CENTER_SK integer, + CC_CALL_CENTER_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + CC_REC_START_DATE date, + CC_REC_END_DATE date, + CC_CLOSED_DATE_SK integer, + CC_OPEN_DATE_SK integer, + CC_NAME varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + CC_CLASS varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + CC_EMPLOYEES integer, + CC_SQ_FT integer, + CC_HOURS char(20) COLLATE Latin1_General_100_BIN2_UTF8, + CC_MANAGER varchar(40) COLLATE Latin1_General_100_BIN2_UTF8, + CC_MKT_ID integer, + CC_MKT_CLASS char(50) COLLATE Latin1_General_100_BIN2_UTF8, + CC_MKT_DESC varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + CC_MARKET_MANAGER varchar(40) COLLATE Latin1_General_100_BIN2_UTF8, + CC_DIVISION integer, + CC_DIVISION_NAME varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + CC_COMPANY integer, + CC_COMPANY_NAME char(50) COLLATE Latin1_General_100_BIN2_UTF8, + CC_STREET_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CC_STREET_NAME varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + CC_STREET_TYPE char(15) COLLATE Latin1_General_100_BIN2_UTF8, + CC_SUITE_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CC_CITY varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + CC_COUNTY varchar(30) COLLATE Latin1_General_100_BIN2_UTF8, + CC_STATE char(2) COLLATE Latin1_General_100_BIN2_UTF8, + CC_ZIP char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CC_COUNTRY varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + CC_GMT_OFFSET decimal(5,2), + CC_TAX_PERCENTAGE decimal(5,2) +) AS call_center; +GO + +CREATE VIEW [catalog_page] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/catalog_page/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + CP_CATALOG_PAGE_SK integer, + CP_CATALOG_PAGE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + CP_START_DATE_SK integer, + CP_END_DATE_SK integer, + CP_DEPARTMENT varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + CP_CATALOG_NUMBER integer, + CP_CATALOG_PAGE_NUMBER integer, + CP_DESCRIPTION varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + CP_TYPE varchar(100) COLLATE Latin1_General_100_BIN2_UTF8 +) AS catalog_page; +GO + +CREATE VIEW [catalog_returns] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/catalog_returns/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + CR_RETURNED_DATE_SK integer, + CR_RETURNED_TIME_SK integer, + CR_ITEM_SK integer, + CR_REFUNDED_CUSTOMER_SK integer, + CR_REFUNDED_CDEMO_SK integer, + CR_REFUNDED_HDEMO_SK integer, + CR_REFUNDED_ADDR_SK integer, + CR_RETURNING_CUSTOMER_SK integer, + CR_RETURNING_CDEMO_SK integer, + CR_RETURNING_HDEMO_SK integer, + CR_RETURNING_ADDR_SK integer, + CR_CALL_CENTER_SK integer, + CR_CATALOG_PAGE_SK integer, + CR_SHIP_MODE_SK integer, + CR_WAREHOUSE_SK integer, + CR_REASON_SK integer, + 
CR_ORDER_NUMBER bigint, + CR_RETURN_QUANTITY integer, + CR_RETURN_AMOUNT decimal(7,2), + CR_RETURN_TAX decimal(7,2), + CR_RETURN_AMT_INC_TAX decimal(7,2), + CR_FEE decimal(7,2), + CR_RETURN_SHIP_COST decimal(7,2), + CR_REFUNDED_CASH decimal(7,2), + CR_REVERSED_CHARGE decimal(7,2), + CR_STORE_CREDIT decimal(7,2), + CR_NET_LOSS decimal(7,2) +) AS catalog_returns; +GO + +CREATE VIEW [catalog_sales] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/catalog_sales/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + CS_SOLD_DATE_SK integer, + CS_SOLD_TIME_SK integer, + CS_SHIP_DATE_SK integer, + CS_BILL_CUSTOMER_SK integer, + CS_BILL_CDEMO_SK integer, + CS_BILL_HDEMO_SK integer, + CS_BILL_ADDR_SK integer, + CS_SHIP_CUSTOMER_SK integer, + CS_SHIP_CDEMO_SK integer, + CS_SHIP_HDEMO_SK integer, + CS_SHIP_ADDR_SK integer, + CS_CALL_CENTER_SK integer, + CS_CATALOG_PAGE_SK integer, + CS_SHIP_MODE_SK integer, + CS_WAREHOUSE_SK integer, + CS_ITEM_SK integer, + CS_PROMO_SK integer, + CS_ORDER_NUMBER bigint, + CS_QUANTITY integer, + CS_WHOLESALE_COST decimal(7,2), + CS_LIST_PRICE decimal(7,2), + CS_SALES_PRICE decimal(7,2), + CS_EXT_DISCOUNT_AMT decimal(7,2), + CS_EXT_SALES_PRICE decimal(7,2), + CS_EXT_WHOLESALE_COST decimal(7,2), + CS_EXT_LIST_PRICE decimal(7,2), + CS_EXT_TAX decimal(7,2), + CS_COUPON_AMT decimal(7,2), + CS_EXT_SHIP_COST decimal(7,2), + CS_NET_PAID decimal(7,2), + CS_NET_PAID_INC_TAX decimal(7,2), + CS_NET_PAID_INC_SHIP decimal(7,2), + CS_NET_PAID_INC_SHIP_TAX decimal(7,2), + CS_NET_PROFIT decimal(7,2) +) AS catalog_sales; +GO + +CREATE VIEW [customer] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/customer/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + C_CUSTOMER_SK integer, + C_CUSTOMER_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + C_CURRENT_CDEMO_SK integer, + C_CURRENT_HDEMO_SK integer, + C_CURRENT_ADDR_SK integer, + C_FIRST_SHIPTO_DATE_SK integer, + C_FIRST_SALES_DATE_SK integer, + C_SALUTATION char(10) COLLATE Latin1_General_100_BIN2_UTF8, + C_FIRST_NAME char(20) COLLATE Latin1_General_100_BIN2_UTF8, + C_LAST_NAME char(30) COLLATE Latin1_General_100_BIN2_UTF8, + C_PREFERRED_CUST_FLAG char(1) COLLATE Latin1_General_100_BIN2_UTF8, + C_BIRTH_DAY integer, + C_BIRTH_MONTH integer, + C_BIRTH_YEAR integer, + C_BIRTH_COUNTRY varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + C_LOGIN char(13) COLLATE Latin1_General_100_BIN2_UTF8, + C_EMAIL_ADDRESS char(50) COLLATE Latin1_General_100_BIN2_UTF8, + C_LAST_REVIEW_DATE_SK integer +) AS customer; +GO + +CREATE VIEW [customer_address] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/customer_address/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + CA_ADDRESS_SK integer, + CA_ADDRESS_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + CA_STREET_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CA_STREET_NAME varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + CA_STREET_TYPE char(15) COLLATE Latin1_General_100_BIN2_UTF8, + CA_SUITE_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CA_CITY varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + CA_COUNTY varchar(30) COLLATE Latin1_General_100_BIN2_UTF8, + CA_STATE char(2) COLLATE Latin1_General_100_BIN2_UTF8, + CA_ZIP char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CA_COUNTRY varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + CA_GMT_OFFSET decimal(5,2), + CA_LOCATION_TYPE char(20) COLLATE Latin1_General_100_BIN2_UTF8 +) AS customer_address; +GO + +CREATE VIEW 
[customer_demographics] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/customer_demographics/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + CD_DEMO_SK integer, + CD_GENDER char(1) COLLATE Latin1_General_100_BIN2_UTF8, + CD_MARITAL_STATUS char(1) COLLATE Latin1_General_100_BIN2_UTF8, + CD_EDUCATION_STATUS char(20) COLLATE Latin1_General_100_BIN2_UTF8, + CD_PURCHASE_ESTIMATE integer, + CD_CREDIT_RATING char(10) COLLATE Latin1_General_100_BIN2_UTF8, + CD_DEP_COUNT integer, + CD_DEP_EMPLOYED_COUNT integer, + CD_DEP_COLLEGE_COUNT integer +) AS customer_demographics; +GO + +CREATE VIEW [date_dim] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/date_dim/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + D_DATE_SK integer, + D_DATE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + D_DATE date, + D_MONTH_SEQ integer, + D_WEEK_SEQ integer, + D_QUARTER_SEQ integer, + D_YEAR integer, + D_DOW integer, + D_MOY integer, + D_DOM integer, + D_QOY integer, + D_FY_YEAR integer, + D_FY_QUARTER_SEQ integer, + D_FY_WEEK_SEQ integer, + D_DAY_NAME char(9) COLLATE Latin1_General_100_BIN2_UTF8, + D_QUARTER_NAME char(6) COLLATE Latin1_General_100_BIN2_UTF8, + D_HOLIDAY char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_WEEKEND char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_FOLLOWING_HOLIDAY char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_FIRST_DOM integer, + D_LAST_DOM integer, + D_SAME_DAY_LY integer, + D_SAME_DAY_LQ integer, + D_CURRENT_DAY char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_CURRENT_WEEK char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_CURRENT_MONTH char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_CURRENT_QUARTER char(1) COLLATE Latin1_General_100_BIN2_UTF8, + D_CURRENT_YEAR char(1) COLLATE Latin1_General_100_BIN2_UTF8 +) AS date_dim; +GO + +CREATE VIEW [household_demographics] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/household_demographics/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + HD_DEMO_SK integer, + HD_INCOME_BAND_SK integer, + HD_BUY_POTENTIAL char(15) COLLATE Latin1_General_100_BIN2_UTF8, + HD_DEP_COUNT integer, + HD_VEHICLE_COUNT integer +) AS household_demographics; +GO + +CREATE VIEW [income_band] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/income_band/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + IB_INCOME_BAND_SK integer, + IB_LOWER_BOUND integer, + IB_UPPER_BOUND integer +) AS income_band; +GO + +CREATE VIEW [inventory] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/inventory/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + INV_DATE_SK integer, + INV_ITEM_SK integer, + INV_WAREHOUSE_SK integer, + INV_QUANTITY_ON_HAND integer +) AS inventory; +GO + +CREATE VIEW [item] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/item/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + I_ITEM_SK integer, + I_ITEM_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + I_REC_START_DATE date, + I_REC_END_DATE date, + I_ITEM_DESC varchar(200) COLLATE Latin1_General_100_BIN2_UTF8, + I_CURRENT_PRICE decimal(7,2), + I_WHOLESALE_COST decimal(7,2), + I_BRAND_ID integer, + I_BRAND char(50) COLLATE Latin1_General_100_BIN2_UTF8, + I_CLASS_ID integer, + I_CLASS char(50) COLLATE Latin1_General_100_BIN2_UTF8, + I_CTGRY_ID integer, + I_CTGRY char(50) COLLATE Latin1_General_100_BIN2_UTF8, + I_MANUFACT_ID integer, + I_MANUFACT char(50) COLLATE Latin1_General_100_BIN2_UTF8, + 
I_SIZE char(20) COLLATE Latin1_General_100_BIN2_UTF8, + I_FORMULATION char(20) COLLATE Latin1_General_100_BIN2_UTF8, + I_COLOR char(20) COLLATE Latin1_General_100_BIN2_UTF8, + I_UNITS char(10) COLLATE Latin1_General_100_BIN2_UTF8, + I_CONTAINER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + I_MANAGER_ID integer, + I_PRODUCT_NAME char(50) COLLATE Latin1_General_100_BIN2_UTF8 +) AS item; +GO + +CREATE VIEW [promotion] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/promotion/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + P_PROMO_SK integer, + P_PROMO_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + P_START_DATE_SK integer, + P_END_DATE_SK integer, + P_ITEM_SK integer, + P_COST decimal(15,2), + P_RESPONSE_TARGET integer, + P_PROMO_NAME char(50) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_DMAIL char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_EMAIL char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_CATALOG char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_TV char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_RADIO char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_PRESS char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_EVENT char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_DEMO char(1) COLLATE Latin1_General_100_BIN2_UTF8, + P_CHANNEL_DETAILS varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + P_PURPOSE char(15) COLLATE Latin1_General_100_BIN2_UTF8, + P_DISCOUNT_ACTIVE char(1) COLLATE Latin1_General_100_BIN2_UTF8 +) AS promotion; +GO + +CREATE VIEW [reason] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/reason/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + R_REASON_SK integer, + R_REASON_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + R_REASON_DESC char(100) COLLATE Latin1_General_100_BIN2_UTF8 +) AS reason; +GO + +CREATE VIEW [ship_mode] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/ship_mode/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + SM_SHIP_MODE_SK integer, + SM_SHIP_MODE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + SM_TYPE char(30) COLLATE Latin1_General_100_BIN2_UTF8, + SM_CODE char(10) COLLATE Latin1_General_100_BIN2_UTF8, + SM_CARRIER char(20) COLLATE Latin1_General_100_BIN2_UTF8, + SM_CONTRACT char(20) COLLATE Latin1_General_100_BIN2_UTF8 +) AS ship_mode; +GO + +CREATE VIEW [store] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/store/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + S_STORE_SK integer, + S_STORE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + S_REC_START_DATE date, + S_REC_END_DATE date, + S_CLOSED_DATE_SK integer, + S_STORE_NAME varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + S_NUMBER_EMPLOYEES integer, + S_FLOOR_SPACE integer, + S_HOURS char(20) COLLATE Latin1_General_100_BIN2_UTF8, + S_MANAGER varchar(40) COLLATE Latin1_General_100_BIN2_UTF8, + S_MARKET_ID integer, + S_GEOGRAPHY_CLASS varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + S_MARKET_DESC varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + S_MARKET_MANAGER varchar(40) COLLATE Latin1_General_100_BIN2_UTF8, + S_DIVISION_ID integer, + S_DIVISION_NAME varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + S_COMPANY_ID integer, + S_COMPANY_NAME varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + S_STREET_NUMBER varchar(10) COLLATE Latin1_General_100_BIN2_UTF8, + S_STREET_NAME varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + S_STREET_TYPE char(15) COLLATE 
Latin1_General_100_BIN2_UTF8, + S_SUITE_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + S_CITY varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + S_COUNTY varchar(30) COLLATE Latin1_General_100_BIN2_UTF8, + S_STATE char(2) COLLATE Latin1_General_100_BIN2_UTF8, + S_ZIP char(10) COLLATE Latin1_General_100_BIN2_UTF8, + S_COUNTRY varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + S_GMT_OFFSET decimal(5,2), + S_TAX_PRECENTAGE decimal(5,2) +) AS store; +GO + +CREATE VIEW [store_returns] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/store_returns/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + SR_RETURNED_DATE_SK integer, + SR_RETURN_TIME_SK integer, + SR_ITEM_SK integer, + SR_CUSTOMER_SK integer, + SR_CDEMO_SK integer, + SR_HDEMO_SK integer, + SR_ADDR_SK integer, + SR_STORE_SK integer, + SR_REASON_SK integer, + SR_TICKET_NUMBER integer, + SR_RETURN_QUANTITY integer, + SR_RETURN_AMT decimal(7,2), + SR_RETURN_TAX decimal(7,2), + SR_RETURN_AMT_INC_TAX decimal(7,2), + SR_FEE decimal(7,2), + SR_RETURN_SHIP_COST decimal(7,2), + SR_REFUNDED_CASH decimal(7,2), + SR_REVERSED_CHARGE decimal(7,2), + SR_STORE_CREDIT decimal(7,2), + SR_NET_LOSS decimal(7,2) +) AS store_returns; +GO + +CREATE VIEW [store_sales] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/store_sales/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + SS_SOLD_DATE_SK integer, + SS_SOLD_TIME_SK integer, + SS_ITEM_SK integer, + SS_CUSTOMER_SK integer, + SS_CDEMO_SK integer, + SS_HDEMO_SK integer, + SS_ADDR_SK integer, + SS_STORE_SK integer, + SS_PROMO_SK integer, + SS_TICKET_NUMBER integer, + SS_QUANTITY integer, + SS_WHOLESALE_COST decimal(7, 2), + SS_LIST_PRICE decimal(7, 2), + SS_SALES_PRICE decimal(7, 2), + SS_EXT_DISCOUNT_AMT decimal(7, 2), + SS_EXT_SALES_PRICE decimal(7, 2), + SS_EXT_WHOLESALE_COST decimal(7, 2), + SS_EXT_LIST_PRICE decimal(7, 2), + SS_EXT_TAX decimal(7, 2), + SS_COUPON_AMT decimal(7, 2), + SS_NET_PAID decimal(7, 2), + SS_NET_PAID_INC_TAX decimal(7, 2), + SS_NET_PROFIT decimal(7, 2) +) AS store_sales; +GO + +CREATE VIEW [time_dim] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/time_dim/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + T_TIME_SK integer, + T_TIME_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + T_TIME integer, + T_HOUR integer, + T_MINUTE integer, + T_SECOND integer, + T_AM_PM char(2) COLLATE Latin1_General_100_BIN2_UTF8, + T_SHIFT char(20) COLLATE Latin1_General_100_BIN2_UTF8, + T_SUB_SHIFT char(20) COLLATE Latin1_General_100_BIN2_UTF8, + T_MEAL_TIME char(20) COLLATE Latin1_General_100_BIN2_UTF8 +) AS time_dim; +GO + +CREATE VIEW [warehouse] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/warehouse/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + W_WAREHOUSE_SK integer, + W_WAREHOUSE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + W_WAREHOUSE_NAME varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + W_WAREHOUSE_SQ_FT integer, + W_STREET_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + W_STREET_NAME varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + W_STREET_TYPE char(15) COLLATE Latin1_General_100_BIN2_UTF8, + W_SUITE_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + W_CITY varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + W_COUNTY varchar(30) COLLATE Latin1_General_100_BIN2_UTF8, + W_STATE char(2) COLLATE Latin1_General_100_BIN2_UTF8, + W_ZIP char(10) COLLATE Latin1_General_100_BIN2_UTF8, + W_COUNTRY varchar(20) COLLATE 
Latin1_General_100_BIN2_UTF8, + W_GMT_OFFSET decimal(5,2) +) AS warehouse; +GO + +CREATE VIEW [web_page] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/web_page/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + WP_WEB_PAGE_SK integer, + WP_WEB_PAGE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + WP_REC_START_DATE date, + WP_REC_END_DATE date, + WP_CREATION_DATE_SK integer, + WP_ACCESS_DATE_SK integer, + WP_AUTOGEN_FLAG char(1) COLLATE Latin1_General_100_BIN2_UTF8, + WP_CUSTOMER_SK integer, + WP_URL varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + WP_TYPE char(50) COLLATE Latin1_General_100_BIN2_UTF8, + WP_CHAR_COUNT integer, + WP_LINK_COUNT integer, + WP_IMAGE_COUNT integer, + WP_MAX_AD_COUNT integer +) AS web_page; +GO + +CREATE VIEW [web_returns] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/web_returns/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + WR_RETURNED_DATE_SK integer, + WR_RETURNED_TIME_SK integer, + WR_ITEM_SK integer, + WR_REFUNDED_CUSTOMER_SK integer, + WR_REFUNDED_CDEMO_SK integer, + WR_REFUNDED_HDEMO_SK integer, + WR_REFUNDED_ADDR_SK integer, + WR_RETURNING_CUSTOMER_SK integer, + WR_RETURNING_CDEMO_SK integer, + WR_RETURNING_HDEMO_SK integer, + WR_RETURNING_ADDR_SK integer, + WR_WEB_PAGE_SK integer, + WR_REASON_SK integer, + WR_ORDER_NUMBER integer, + WR_RETURN_QUANTITY integer, + WR_RETURN_AMT decimal(7,2), + WR_RETURN_TAX decimal(7,2), + WR_RETURN_AMT_INC_TAX decimal(7,2), + WR_FEE decimal(7,2), + WR_RETURN_SHIP_COST decimal(7,2), + WR_REFUNDED_CASH decimal(7,2), + WR_REVERSED_CHARGE decimal(7,2), + WR_ACCOUNT_CREDIT decimal(7,2), + WR_NET_LOSS decimal(7,2) +) AS web_returns; +GO + +CREATE VIEW [web_sales] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/web_sales/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + WS_SOLD_DATE_SK integer, + WS_SOLD_TIME_SK integer, + WS_SHIP_DATE_SK integer, + WS_ITEM_SK integer, + WS_BILL_CUSTOMER_SK integer, + WS_BILL_CDEMO_SK integer, + WS_BILL_HDEMO_SK integer, + WS_BILL_ADDR_SK integer, + WS_SHIP_CUSTOMER_SK integer, + WS_SHIP_CDEMO_SK integer, + WS_SHIP_HDEMO_SK integer, + WS_SHIP_ADDR_SK integer, + WS_WEB_PAGE_SK integer, + WS_WEB_SITE_SK integer, + WS_SHIP_MODE_SK integer, + WS_WAREHOUSE_SK integer, + WS_PROMO_SK integer, + WS_ORDER_NUMBER integer, + WS_QUANTITY integer, + WS_WHOLESALE_COST decimal(7,2), + WS_LIST_PRICE decimal(7,2), + WS_SALES_PRICE decimal(7,2), + WS_EXT_DISCOUNT_AMT decimal(7,2), + WS_EXT_SALES_PRICE decimal(7,2), + WS_EXT_WHOLESALE_COST decimal(7,2), + WS_EXT_LIST_PRICE decimal(7,2), + WS_EXT_TAX decimal(7,2), + WS_COUPON_AMT decimal(7,2), + WS_EXT_SHIP_COST decimal(7,2), + WS_NET_PAID decimal(7,2), + WS_NET_PAID_INC_TAX decimal(7,2), + WS_NET_PAID_INC_SHIP decimal(7,2), + WS_NET_PAID_INC_SHIP_TAX decimal(7,2), + WS_NET_PROFIT decimal(7,2) +) AS web_sales; +GO + +CREATE VIEW [web_site] AS +SELECT * FROM +OPENROWSET( + BULK N'parquet/web_site/*', FORMAT = 'PARQUET', FIELDTERMINATOR = '|', DATA_SOURCE = 'tpcds_data') + WITH ( + WEB_SITE_SK integer, + WEB_SITE_ID char(16) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_REC_START_DATE date, + WEB_REC_END_DATE date, + WEB_NAME varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_OPEN_DATE_SK integer, + WEB_CLOSE_DATE_SK integer, + WEB_CLASS varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_MANAGER varchar(40) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_MKT_ID integer, + WEB_MKT_CLASS varchar(50) COLLATE Latin1_General_100_BIN2_UTF8, + 
WEB_MKT_DESC varchar(100) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_MARKET_MANAGER varchar(40) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_COMPANY_ID integer, + WEB_COMPANY_NAME char(50) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_STREET_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_STREET_NAME varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_STREET_TYPE char(15) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_SUITE_NUMBER char(10) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_CITY varchar(60) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_COUNTY varchar(30) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_STATE char(2) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_ZIP char(10) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_COUNTRY varchar(20) COLLATE Latin1_General_100_BIN2_UTF8, + WEB_GMT_OFFSET decimal(5,2), + WEB_TAX_PERCENTAGE decimal(5,2) +) AS web_site; +GO From ca9db824f00cc1f30c2daa1de9a226b11472af3b Mon Sep 17 00:00:00 2001 From: Grace Date: Tue, 27 Jul 2021 13:29:28 +0800 Subject: [PATCH 05/25] fix bug of pagination --- .../ImportADFtoSynapse/importADFtoSynapseTool.ps1 | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Pipelines/ImportADFtoSynapse/importADFtoSynapseTool.ps1 b/Pipelines/ImportADFtoSynapse/importADFtoSynapseTool.ps1 index 28bb84b..6c519b3 100644 --- a/Pipelines/ImportADFtoSynapse/importADFtoSynapseTool.ps1 +++ b/Pipelines/ImportADFtoSynapse/importADFtoSynapseTool.ps1 @@ -354,6 +354,16 @@ function ProcessResource { Write-Host "" Write-Host "Processing $resourceType" -ForegroundColor White $resourcesToBeCopied.AddRange($srcResponse.Value); + + while ($srcResponse.PSobject.Properties.Name.Contains("nextLink")) { + Write-Host "Processing next page $srcResponse.nextLink" + $nextLink = $srcResponse.nextLink + $srcResponse = Invoke-RestMethod -UseBasicParsing -Uri $nextLink -Method Get -ContentType "application/json" -Headers @{ Authorization = "Bearer $token"} + if ($srcResponse.Value.Length -gt 0) { + $resourcesToBeCopied.AddRange($srcResponse.Value); + } + } + WriteSuccessResponse(" Migrating $($resourcesToBeCopied.Count) $resourceType") } elseif($resourcesToBeCopied.Count -le 0) { From f005a6a706beec3569c5d149511000eaf6a06507 Mon Sep 17 00:00:00 2001 From: Marina Levin Date: Tue, 10 Aug 2021 16:48:17 -0400 Subject: [PATCH 06/25] Added my notebook --- .../Scala/Migrate_snowflake_schema.ipynb | 459 ++++++++++++++++++ 1 file changed, 459 insertions(+) create mode 100644 Notebooks/Scala/Migrate_snowflake_schema.ipynb diff --git a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb new file mode 100644 index 0000000..8b1b5cf --- /dev/null +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -0,0 +1,459 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "## Migrate schema from Snowflake to Synapse SQL Dedicated Pool\r\n", + "\r\n", + "---\r\n", + "\r\n", + "### This notebooks demos how to connect to Snowflake, read data from INFORMATION_SCHEMA, gather list of the tables for given schema and move those tables to Synapse SQL dedicated pool\r\n", + "\r\n", + "
    \r\n", + "
  • Define connection source
  • \r\n", + "
  • Specify connection options for the Snowflake instance
  • \r\n", + "
  • Read Snowflake Information_schema.tables to compyle list of tables for our schema\r\n", + "
      \r\n", + "
    • Read each Snowflake table into into Spark DataFrame
    • \r\n", + "
    • Write DataFrame to table to Synapse SQL Dedicated pool\r\n", + "
    \r\n", + "
  • \r\n", + "
\r\n", + "\r\n", + "\r\n", + "\r\n", + "\r\n" + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "// To use Snowflake as a data source in Spark, use the .format option to provide the Snowflake connector class name that defines the data source.\r\n", + "// Please note that you need to add spark-snowflake_2.12-2.9.0-spark_3.1.jar and snowflake-jdbc-3.13.6.jar to workspace packages as well as to cluster/session packages\r\n", + "\r\n", + "import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME\r\n", + "val SNOWFLAKE_SOURCE_NAME = \"net.snowflake.spark.snowflake\"" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "DemoCluster", + "session_id": 17, + "statement_id": 1, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-09T14:14:57.9353514Z", + "session_start_time": "2021-08-09T14:14:57.98765Z", + "execution_start_time": "2021-08-09T14:17:14.6699123Z", + "execution_finish_time": "2021-08-09T14:17:15.2227281Z" + }, + "text/plain": "StatementMeta(DemoCluster, 17, 1, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "error", + "ename": "Error", + "evalue": ":32: error: object snowflake is not a member of package net", + "traceback": [ + "Error: :32: error: object snowflake is not a member of package net", + " import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME\n", + " ^\n" + ] + } + ], + "execution_count": 1, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// setting default paramter \r\n", + "\r\n", + "val sfschema=\"test1\"" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "tags": [ + "parameters" + ] + } + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "// To not expose snowflake credentials, it is a good practice to store user, password and account in Azure Key Vault service in your suscription\r\n", + "// Please note that you need to link your Azure Key Vault service (AKV) to your Synapse workspace \r\n", + "// mssparkutils package let you retrive your secrets from AKV\r\n", + "\r\n", + "val user = mssparkutils.credentials.getSecret(\"azkey2021\", \"sfuser\",\"AzureKeyV1\")\r\n", + "val password = mssparkutils.credentials.getSecret(\"azkey2021\", \"sfpassword\",\"AzureKeyV1\")\r\n", + "val account = mssparkutils.credentials.getSecret(\"azkey2021\", \"sfaccount\",\"AzureKeyV1\")\r\n", + "val account_URL = \"https://\" + account + \".azure.snowflakecomputing.com\"" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "TestS3Cluster", + "session_id": 18, + "statement_id": 2, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-04T17:58:15.5479493Z", + "session_start_time": null, + "execution_start_time": "2021-08-04T17:58:15.7178403Z", + "execution_finish_time": "2021-08-04T17:58:23.2651865Z" + }, + "text/plain": "StatementMeta(TestS3Cluster, 18, 2, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "user: String = NKADAVIL07\n", + "password: String = Ashburn07\n", + 
"account: String = yg58220.east-us-2\n", + "account_URL: String = https://yg58220.east-us-2.azure.snowflakecomputing.com\n" + ] + } + ], + "execution_count": 79, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// set up options to connect to Snowflake schema public on TESTDB database\r\n", + " \r\n", + "val sfoptions = Map( \r\n", + " \"sfUrl\" -> account_URL,\r\n", + " \"sfUser\"->user,\r\n", + " \"sfPassword\"-> password,\r\n", + " \"sfDatabase\"-> \"TESTDB\",\r\n", + " \"sfSchema\"-> \"PUBLIC\",\r\n", + " \"sfWarehouse\"-> \"COMPUTE_WH\"\r\n", + ")" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "TestS3Cluster", + "session_id": 18, + "statement_id": 3, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-04T17:58:26.7006748Z", + "session_start_time": null, + "execution_start_time": "2021-08-04T17:58:26.8059265Z", + "execution_finish_time": "2021-08-04T17:58:28.1653268Z" + }, + "text/plain": "StatementMeta(TestS3Cluster, 18, 3, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "sfoptions: scala.collection.immutable.Map[String,String] = Map(sfUrl -> https://yg58220.east-us-2.azure.snowflakecomputing.com, sfSchema -> PUBLIC, sfPassword -> Ashburn07, sfUser -> NKADAVIL07, sfWarehouse -> COMPUTE_WH, sfDatabase -> TESTDB)\n" + ] + } + ], + "execution_count": 80, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// set up options to connect to schema INFOrMATION_SCHEMA. That schema in Snowflake contain your database metadata\r\n", + " \r\n", + "val sfoptions1 = Map( \r\n", + " \"sfUrl\" -> account_URL,\r\n", + " \"sfUser\"->user,\r\n", + " \"sfPassword\"-> password,\r\n", + " \"sfDatabase\"-> \"TESTDB\",\r\n", + " \"sfSchema\"-> \"INFORMATION_SCHEMA\",\r\n", + " \"sfWarehouse\"-> \"COMPUTE_WH\"\r\n", + ")\r\n" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "TestS3Cluster", + "session_id": 18, + "statement_id": 4, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-04T17:58:34.7448636Z", + "session_start_time": null, + "execution_start_time": "2021-08-04T17:58:34.8537732Z", + "execution_finish_time": "2021-08-04T17:58:36.0081044Z" + }, + "text/plain": "StatementMeta(TestS3Cluster, 18, 4, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "sfoptions1: scala.collection.immutable.Map[String,String] = Map(sfUrl -> https://yg58220.east-us-2.azure.snowflakecomputing.com, sfSchema -> INFORMATION_SCHEMA, sfPassword -> Ashburn07, sfUser -> NKADAVIL07, sfWarehouse -> COMPUTE_WH, sfDatabase -> TESTDB)\n" + ] + } + ], + "execution_count": 81, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true, + "tags": [] + } + }, + { + "cell_type": "code", + "source": [ + "// read table INFORMATION_SCHEMA.TABLES into DataFrame. 
We need it to compile list of the tables within our schema\r\n", + "\r\n", + "val df_tl=spark.read.format( SNOWFLAKE_SOURCE_NAME ).options(sfoptions1).option(\"dbtable\",\"TABLES\").load()\r\n", + "//display(df_tl)" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "TestS3Cluster", + "session_id": 18, + "statement_id": 5, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-04T17:58:42.8787864Z", + "session_start_time": null, + "execution_start_time": "2021-08-04T17:58:42.9973772Z", + "execution_finish_time": "2021-08-04T17:58:48.579489Z" + }, + "text/plain": "StatementMeta(TestS3Cluster, 18, 5, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "StructuredStream-spark package version: 3.0.0-2.1.1\n", + "df_tl: org.apache.spark.sql.DataFrame = [TABLE_CATALOG: string, TABLE_SCHEMA: string ... 20 more fields]\n" + ] + } + ], + "execution_count": 82, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// For easy iteration convert selected info from DataFrame to collection\r\n", + "val df_tab_list = df_tl.select(\"table_schema\", \"table_name\").filter(\"table_schema='PUBLIC'\").collect()\r\n", + "//println(df_tab_list)" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "TestS3Cluster", + "session_id": 18, + "statement_id": 9, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-04T18:02:25.0054968Z", + "session_start_time": null, + "execution_start_time": "2021-08-04T18:02:25.209112Z", + "execution_finish_time": "2021-08-04T18:02:34.1314757Z" + }, + "text/plain": "StatementMeta(TestS3Cluster, 18, 9, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "df_tab_list: Array[org.apache.spark.sql.Row] = Array([PUBLIC,CUSTOMERT], [PUBLIC,CUSTOMER_TEST], [PUBLIC,NATIONT], [PUBLIC,REGIONT])\n", + "[Lorg.apache.spark.sql.Row;@6d6b91a4\n" + ] + } + ], + "execution_count": 86, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// For each table in the schema read data from Snowflake table into data frame and write it to Synapse SQL Dedicated Pool\r\n", + "\r\n", + "df_tab_list.foreach(row=>\r\n", + " {\r\n", + " val tname = row.getString(1) \r\n", + " //println(tname)\r\n", + " val df_temp=spark.read.format( SNOWFLAKE_SOURCE_NAME ).options(sfoptions).option(\"dbtable\",tname).load()\r\n", + " val target_table = \"SQLdedpool1.\" + sfschema + \".\" + tname\r\n", + " println(target_table)\r\n", + " df_temp.write.synapsesql(target_table, Constants.INTERNAL)\r\n", + " })" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "TestS3Cluster", + "session_id": 18, + "statement_id": 10, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-08-04T18:02:57.9134366Z", + "session_start_time": null, + "execution_start_time": "2021-08-04T18:02:58.0345256Z", + "execution_finish_time": "2021-08-04T18:04:43.7619937Z" + }, + 
"text/plain": "StatementMeta(TestS3Cluster, 18, 10, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "CUSTOMERT\n", + "CUSTOMER_TEST\n", + "NATIONT\n", + "REGIONT\n" + ] + } + ], + "execution_count": 87, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + } + ], + "metadata": { + "kernelspec": { + "name": "synapse_spark", + "language": "Scala", + "display_name": "Synapse Spark" + }, + "language_info": { + "name": "scala" + }, + "kernel_info": { + "name": "synapse_spark" + }, + "save_output": true, + "synapse_widget": { + "version": "0.1", + "state": {} + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file From 2dbf2a8289e5b4f9f47473b28fbb500b64055e08 Mon Sep 17 00:00:00 2001 From: Kaiyue Zhou <70192016+kaiyuezhou@users.noreply.github.com> Date: Wed, 11 Aug 2021 16:25:38 +0800 Subject: [PATCH 07/25] Add Spark diagnostic emitter spark pool configuration samples (#112) * Add samples * update placeholder wording --- Diagnostic/SparkDiagnosticEmitter/README.md | 8 ++++++++ .../diagnostic-emitter-azure-event-hub-conf.txt | 5 +++++ ...iagnostic-emitter-azure-log-analytics-conf.txt | 6 ++++++ .../diagnostic-emitter-azure-storage-conf.txt | 6 ++++++ .../diagnostic-emitter-multi-destination-conf.txt | 15 +++++++++++++++ 5 files changed, 40 insertions(+) create mode 100644 Diagnostic/SparkDiagnosticEmitter/README.md create mode 100644 Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-event-hub-conf.txt create mode 100644 Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-log-analytics-conf.txt create mode 100644 Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-storage-conf.txt create mode 100644 Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-multi-destination-conf.txt diff --git a/Diagnostic/SparkDiagnosticEmitter/README.md b/Diagnostic/SparkDiagnosticEmitter/README.md new file mode 100644 index 0000000..81d09c1 --- /dev/null +++ b/Diagnostic/SparkDiagnosticEmitter/README.md @@ -0,0 +1,8 @@ +# Synapse Spark Diagnostic Emitter Configuration Samples + +## Introduction + +The Azure Synapse Spark diagnostic emitter extension is a library which enables Spark application to emit the logs, event logs and metrics to one or more destinations, +including Azure Log Analytics, Azure Storage and Azure EventHub. + +The sample templates in this repo are designed to help users quickly enable this feature. 
\ No newline at end of file diff --git a/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-event-hub-conf.txt b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-event-hub-conf.txt new file mode 100644 index 0000000..c21e979 --- /dev/null +++ b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-event-hub-conf.txt @@ -0,0 +1,5 @@ +spark.synapse.diagnostic.emitters MyDestination1 +spark.synapse.diagnostic.emitter.MyDestination1.type AzureEventHub +spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics +spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault +spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault.secretName \ No newline at end of file diff --git a/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-log-analytics-conf.txt b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-log-analytics-conf.txt new file mode 100644 index 0000000..d94082c --- /dev/null +++ b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-log-analytics-conf.txt @@ -0,0 +1,6 @@ +spark.synapse.diagnostic.emitters MyDestination1 +spark.synapse.diagnostic.emitter.MyDestination1.type AzureLogAnalytics +spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics +spark.synapse.diagnostic.emitter.MyDestination1.workspaceId +spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault +spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault.secretName \ No newline at end of file diff --git a/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-storage-conf.txt b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-storage-conf.txt new file mode 100644 index 0000000..1f0e815 --- /dev/null +++ b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-azure-storage-conf.txt @@ -0,0 +1,6 @@ +spark.synapse.diagnostic.emitters MyDestination1 +spark.synapse.diagnostic.emitter.MyDestination1.type AzureStorage +spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics +spark.synapse.diagnostic.emitter.MyDestination1.uri https://.blob.core.windows.net// +spark.synapse.diagnostic.emitter.MyDestination1.auth AccessKey +spark.synapse.diagnostic.emitter.MyDestination1.secret \ No newline at end of file diff --git a/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-multi-destination-conf.txt b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-multi-destination-conf.txt new file mode 100644 index 0000000..065ccda --- /dev/null +++ b/Diagnostic/SparkDiagnosticEmitter/diagnostic-emitter-multi-destination-conf.txt @@ -0,0 +1,15 @@ +spark.synapse.diagnostic.emitters MyDestination1,MyDestination2 + +spark.synapse.diagnostic.emitter.MyDestination1.type AzureStorage +spark.synapse.diagnostic.emitter.MyDestination1.categories Log,EventLog,Metrics +spark.synapse.diagnostic.emitter.MyDestination1.uri https://.blob.core.windows.net// +spark.synapse.diagnostic.emitter.MyDestination1.auth AccessKey +spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault +spark.synapse.diagnostic.emitter.MyDestination1.secret.keyVault.secretName + +spark.synapse.diagnostic.emitters MyDestination2 +spark.synapse.diagnostic.emitter.MyDestination2.type AzureLogAnalytics +spark.synapse.diagnostic.emitter.MyDestination2.categories Log,EventLog,Metrics +spark.synapse.diagnostic.emitter.MyDestination2.workspaceId +spark.synapse.diagnostic.emitter.MyDestination2.secret.keyVault +spark.synapse.diagnostic.emitter.MyDestination2.secret.keyVault.secretName \ No newline at end of file From 
f965305d762e81d08404200fe647920432a42175 Mon Sep 17 00:00:00 2001 From: Arthur Dooner Date: Wed, 18 Aug 2021 10:59:20 -0400 Subject: [PATCH 08/25] Cleaned up phrasing some, but all looks good! --- Notebooks/Scala/Migrate_snowflake_schema.ipynb | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb index 8b1b5cf..3a921ab 100644 --- a/Notebooks/Scala/Migrate_snowflake_schema.ipynb +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -7,12 +7,12 @@ "\r\n", "---\r\n", "\r\n", - "### This notebooks demos how to connect to Snowflake, read data from INFORMATION_SCHEMA, gather list of the tables for given schema and move those tables to Synapse SQL dedicated pool\r\n", + "### This notebooks demos how to connect to Snowflake, read data from a schema named INFORMATION_SCHEMA, gather the list of the tables for the given schema and move those tables to a Synapse SQL dedicated pool.\r\n", "\r\n", "
    \r\n", "
  • Define connection source
  • \r\n", "
  • Specify connection options for the Snowflake instance
  • \r\n", - "
  • Read Snowflake Information_schema.tables to compyle list of tables for our schema\r\n", + "
  • Read Snowflake Information_schema.tables to compile list of tables for our schema\r\n", "
      \r\n", "
    • Read each Snowflake table into into Spark DataFrame
    • \r\n", "
    • Write DataFrame to table to Synapse SQL Dedicated pool\r\n", @@ -113,7 +113,7 @@ { "cell_type": "code", "source": [ - "// To not expose snowflake credentials, it is a good practice to store user, password and account in Azure Key Vault service in your suscription\r\n", + "// To not expose snowflake credentials, it is best practice to store user, password and account in Azure Key Vault service in your suscription\r\n", "// Please note that you need to link your Azure Key Vault service (AKV) to your Synapse workspace \r\n", "// mssparkutils package let you retrive your secrets from AKV\r\n", "\r\n", @@ -224,7 +224,7 @@ { "cell_type": "code", "source": [ - "// set up options to connect to schema INFOrMATION_SCHEMA. That schema in Snowflake contain your database metadata\r\n", + "// Setup options to connect to schema INFORMATION_SCHEMA. That schema in Snowflake contains your database metadata\r\n", " \r\n", "val sfoptions1 = Map( \r\n", " \"sfUrl\" -> account_URL,\r\n", @@ -280,7 +280,7 @@ { "cell_type": "code", "source": [ - "// read table INFORMATION_SCHEMA.TABLES into DataFrame. We need it to compile list of the tables within our schema\r\n", + "// read table INFORMATION_SCHEMA.TABLES into a DataFrame. We need it to compile list of the tables within our schema\r\n", "\r\n", "val df_tl=spark.read.format( SNOWFLAKE_SOURCE_NAME ).options(sfoptions1).option(\"dbtable\",\"TABLES\").load()\r\n", "//display(df_tl)" @@ -330,7 +330,7 @@ { "cell_type": "code", "source": [ - "// For easy iteration convert selected info from DataFrame to collection\r\n", + "// For easy iteration, convert selected info from DataFrame to collection\r\n", "val df_tab_list = df_tl.select(\"table_schema\", \"table_name\").filter(\"table_schema='PUBLIC'\").collect()\r\n", "//println(df_tab_list)" ], @@ -379,7 +379,7 @@ { "cell_type": "code", "source": [ - "// For each table in the schema read data from Snowflake table into data frame and write it to Synapse SQL Dedicated Pool\r\n", + "// For each table in the schema read data from Snowflake table into a DataFrame and write it to Synapse SQL Dedicated Pool.\r\n", "\r\n", "df_tab_list.foreach(row=>\r\n", " {\r\n", @@ -456,4 +456,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} From 0102b21b831c696e26f09753b2e988ec5baa940f Mon Sep 17 00:00:00 2001 From: Arthur Dooner Date: Wed, 18 Aug 2021 11:14:25 -0400 Subject: [PATCH 09/25] Removed an output --- Notebooks/Scala/Migrate_snowflake_schema.ipynb | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb index 3a921ab..2d1f8d9 100644 --- a/Notebooks/Scala/Migrate_snowflake_schema.ipynb +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -59,16 +59,6 @@ "text/plain": "StatementMeta(DemoCluster, 17, 1, Finished, Available)" }, "metadata": {} - }, - { - "output_type": "error", - "ename": "Error", - "evalue": ":32: error: object snowflake is not a member of package net", - "traceback": [ - "Error: :32: error: object snowflake is not a member of package net", - " import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME\n", - " ^\n" - ] } ], "execution_count": 1, From 7ca8e63c8ef5fdc8e1cfa4ddf58d0889e3034bdf Mon Sep 17 00:00:00 2001 From: Arthur Dooner Date: Wed, 18 Aug 2021 11:15:08 -0400 Subject: [PATCH 10/25] Removed output for credential location --- Notebooks/Scala/Migrate_snowflake_schema.ipynb | 10 ---------- 1 file changed, 10 deletions(-) diff --git 
a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb index 2d1f8d9..f3e2a79 100644 --- a/Notebooks/Scala/Migrate_snowflake_schema.ipynb +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -130,16 +130,6 @@ "text/plain": "StatementMeta(TestS3Cluster, 18, 2, Finished, Available)" }, "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "user: String = NKADAVIL07\n", - "password: String = Ashburn07\n", - "account: String = yg58220.east-us-2\n", - "account_URL: String = https://yg58220.east-us-2.azure.snowflakecomputing.com\n" - ] } ], "execution_count": 79, From 183321dd7de4bfd09dfd66c7b0df6952f489c8da Mon Sep 17 00:00:00 2001 From: Arthur Dooner Date: Wed, 18 Aug 2021 11:16:16 -0400 Subject: [PATCH 11/25] Removed last credential --- Notebooks/Scala/Migrate_snowflake_schema.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb index f3e2a79..e4ee055 100644 --- a/Notebooks/Scala/Migrate_snowflake_schema.ipynb +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -183,7 +183,7 @@ "output_type": "stream", "name": "stdout", "text": [ - "sfoptions: scala.collection.immutable.Map[String,String] = Map(sfUrl -> https://yg58220.east-us-2.azure.snowflakecomputing.com, sfSchema -> PUBLIC, sfPassword -> Ashburn07, sfUser -> NKADAVIL07, sfWarehouse -> COMPUTE_WH, sfDatabase -> TESTDB)\n" + "sfoptions: scala.collection.immutable.Map[String,String] = Map(sfUrl -> https://yg58220.east-us-2.azure.snowflakecomputing.com, sfSchema -> PUBLIC, sfPassword -> , sfUser -> , sfWarehouse -> , sfDatabase -> TESTDB)\n" ] } ], From 54650447f8757891d91f460356f964119184ade2 Mon Sep 17 00:00:00 2001 From: Arthur Dooner Date: Wed, 18 Aug 2021 11:17:03 -0400 Subject: [PATCH 12/25] Final cleanup on credentials --- Notebooks/Scala/Migrate_snowflake_schema.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb index e4ee055..7de6de2 100644 --- a/Notebooks/Scala/Migrate_snowflake_schema.ipynb +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -238,7 +238,7 @@ "output_type": "stream", "name": "stdout", "text": [ - "sfoptions1: scala.collection.immutable.Map[String,String] = Map(sfUrl -> https://yg58220.east-us-2.azure.snowflakecomputing.com, sfSchema -> INFORMATION_SCHEMA, sfPassword -> Ashburn07, sfUser -> NKADAVIL07, sfWarehouse -> COMPUTE_WH, sfDatabase -> TESTDB)\n" + "sfoptions1: scala.collection.immutable.Map[String,String] = Map(sfUrl -> https://yg58220.east-us-2.azure.snowflakecomputing.com, sfSchema -> INFORMATION_SCHEMA, sfPassword -> , sfUser -> , sfWarehouse -> , sfDatabase -> TESTDB)\n" ] } ], From d7987eb62c339362ea6423676088eb9b3e68e625 Mon Sep 17 00:00:00 2001 From: Marina Levin <80341311+mlevin19@users.noreply.github.com> Date: Wed, 18 Aug 2021 11:25:40 -0400 Subject: [PATCH 13/25] Updated with comments regarding stored proc --- .../Scala/Migrate_snowflake_schema.ipynb | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb index 7de6de2..ed6319a 100644 --- a/Notebooks/Scala/Migrate_snowflake_schema.ipynb +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -356,6 +356,57 @@ "collapsed": true } }, + { + "cell_type": "markdown", + "metadata": { + "nteract": { + "transient": { + "deleting": false 
+ } + } + }, + "source": [ + "### Note: \r\n", + "We are using df.write.synapsesql method to populate table in SQL dedicated pool. If your target schem anything but \"dbo\", it need to be exist before.\r\n", + "At the same time this target schem ashould not have the table with name specified in this method. Here is stored procedure you can run to make sure that this requirement is met:\r\n", + "\r\n", + "```sql\r\n", + "CREATE PROCEDURE set_sfschema @schemaname sysname\r\n", + "AS BEGIN\r\n", + " DECLARE @cr_stmt NVARCHAR(200) = N'CREATE SCHEMA ' + @schemaname; \r\n", + " -- to imulate cursor processing\r\n", + " CREATE TABLE #temp_tbl\r\n", + " WITH\r\n", + " ( DISTRIBUTION = ROUND_ROBIN\r\n", + " )\r\n", + " AS \r\n", + " SELECT ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) AS Sequence,\r\n", + " table_schema , table_name ,\r\n", + " 'DROP TABLE ' + quotename(table_schema) + '.' + quotename(table_name) as sql_code\r\n", + " from information_schema.tables WHERE table_schema = @schemaname ; \r\n", + " \r\n", + " DECLARE @nbr_statements INT = (SELECT COUNT(*) FROM #temp_tbl)\r\n", + " , @i INT = 1;\r\n", + "\r\n", + " IF (0 = (SELECT COUNT(*) FROM sys.schemas WHERE name = @schemaname))\r\n", + " BEGIN\r\n", + " EXEC sp_executesql @tsql = @cr_stmt;\r\n", + " END\r\n", + " ELSE \r\n", + " WHILE @i <= @nbr_statements\r\n", + " BEGIN\r\n", + " DECLARE @sql_code NVARCHAR(60) = (SELECT sql_code FROM #temp_tbl WHERE Sequence =@i);\r\n", + " EXEC sp_executesql @sql_code;\r\n", + " SET @i +=1;\r\n", + " END\r\n", + " DROP TABLE #temp_tbl; \r\n", + "END\r\n", + "GO\r\n", + "```\r\n", + "\r\n", + "" + ] + }, { "cell_type": "code", "source": [ From 9866d5c6c4f3b700c4a851de1f514f9b8869d429 Mon Sep 17 00:00:00 2001 From: Marina Levin <80341311+mlevin19@users.noreply.github.com> Date: Wed, 18 Aug 2021 11:27:04 -0400 Subject: [PATCH 14/25] fixed some typos --- Notebooks/Scala/Migrate_snowflake_schema.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb index ed6319a..0346b15 100644 --- a/Notebooks/Scala/Migrate_snowflake_schema.ipynb +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -367,8 +367,8 @@ }, "source": [ "### Note: \r\n", - "We are using df.write.synapsesql method to populate table in SQL dedicated pool. If your target schem anything but \"dbo\", it need to be exist before.\r\n", - "At the same time this target schem ashould not have the table with name specified in this method. Here is stored procedure you can run to make sure that this requirement is met:\r\n", + "We are using df.write.synapsesql method to populate table in SQL dedicated pool. If your target schema anything but \"dbo\", it need to be exist before.\r\n", + "At the same time this target schema should not have the table with name specified in this method. 
Here is stored procedure you can run to make sure that this requirement is met:\r\n", "\r\n", "```sql\r\n", "CREATE PROCEDURE set_sfschema @schemaname sysname\r\n", From dfd392b266ff6f8b61cd7f24fceafd6b46ca8ceb Mon Sep 17 00:00:00 2001 From: "Jovan Popovic (MSFT)" Date: Fri, 20 Aug 2021 10:47:06 +0200 Subject: [PATCH 15/25] Create generate-cosmosdb-query-ff-schema.html --- .../generate-cosmosdb-query-ff-schema.html | 225 ++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 docs/synapse-sql/generate-cosmosdb-query-ff-schema.html diff --git a/docs/synapse-sql/generate-cosmosdb-query-ff-schema.html b/docs/synapse-sql/generate-cosmosdb-query-ff-schema.html new file mode 100644 index 0000000..2a43862 --- /dev/null +++ b/docs/synapse-sql/generate-cosmosdb-query-ff-schema.html @@ -0,0 +1,225 @@ + + + + + + OPENROWSET by example - Synapse link for CosmosDB + + + + +
      + [HTML body markup not recovered from this hunk; the visible page text is the heading "Generating OPENROWSET statement for Cosmos DB documents" and the note "Copy the text below in some SQL editor once you generate the script".]
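The hunk above carries only the page markup, not an example of the script it emits. The statement below is a rough, hand-written sketch of the kind of serverless SQL query such a generator is meant to produce for a Cosmos DB analytical store container that uses full-fidelity schema; every account name, key, database, container, and column path in it is a placeholder rather than a value taken from this repository.

```sql
-- Sketch only: account, key, database, container, and column paths are placeholders.
SELECT TOP 10 *
FROM OPENROWSET(
         PROVIDER   = 'CosmosDB',
         CONNECTION = 'Account=<cosmos-account>;Database=<database>;Key=<account-key>',
         OBJECT     = '<container>'
     )
     WITH (
         -- With full-fidelity schema, leaf values sit under typed sub-properties,
         -- so the JSON paths carry the type name (e.g. $.city.string).
         id   VARCHAR(100) '$.id.string',
         city VARCHAR(100) '$.city.string'
     ) AS rows;
```

If the container uses the default well-defined schema instead, the typed suffixes are not needed and columns can usually be mapped by name alone.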
      + + + + + + From f7b7d50e7b3d2a8ff089968abb2951a6f22de658 Mon Sep 17 00:00:00 2001 From: Rodrigo Souza Date: Fri, 27 Aug 2021 13:55:55 -0700 Subject: [PATCH 16/25] score fix --- .../pyspark/2SalesForecastingWithAML.ipynb | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Notebooks/PySpark/Synapse Link for Cosmos DB samples/Retail/spark-notebooks/pyspark/2SalesForecastingWithAML.ipynb b/Notebooks/PySpark/Synapse Link for Cosmos DB samples/Retail/spark-notebooks/pyspark/2SalesForecastingWithAML.ipynb index f79e97c..7030b59 100644 --- a/Notebooks/PySpark/Synapse Link for Cosmos DB samples/Retail/spark-notebooks/pyspark/2SalesForecastingWithAML.ipynb +++ b/Notebooks/PySpark/Synapse Link for Cosmos DB samples/Retail/spark-notebooks/pyspark/2SalesForecastingWithAML.ipynb @@ -444,16 +444,15 @@ "\n", "df_all = align_outputs(y_predictions, X_trans, X_test, y_test, target_column_name)\n", "\n", - "from azureml.automl.core._vendor.automl.client.core.common import metrics\n", + "from azureml.automl.runtime.shared.score import scoring\n", "from matplotlib import pyplot as plt\n", "from automl.client.core.common import constants\n", "\n", - "# use automl metrics module\n", - "scores = metrics.compute_metrics_regression(\n", - " df_all['predicted'],\n", - " df_all[target_column_name],\n", - " list(constants.Metric.SCALAR_REGRESSION_SET),\n", - " None, None, None)\n", + "# use automl scoring module\n", + "scores = scoring.score_regression(\n", + " y_test=df_all[target_column_name],\n", + " y_pred=df_all['predicted'],\n", + " metrics=list(constants.Metric.SCALAR_REGRESSION_SET))\n", "\n", "print(\"[Test data scores]\\n\")\n", "for key, value in scores.items(): \n", From 0e1bdff5f7ef5000d99478e5860c851e83bfe784 Mon Sep 17 00:00:00 2001 From: Marina Levin <80341311+mlevin19@users.noreply.github.com> Date: Wed, 1 Sep 2021 09:26:57 -0400 Subject: [PATCH 17/25] Update Migrate_snowflake_schema.ipynb --- Notebooks/Scala/Migrate_snowflake_schema.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb index 0346b15..6b918d6 100644 --- a/Notebooks/Scala/Migrate_snowflake_schema.ipynb +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -37,7 +37,8 @@ "source": [ "// To use Snowflake as a data source in Spark, use the .format option to provide the Snowflake connector class name that defines the data source.\r\n", "// Please note that you need to add spark-snowflake_2.12-2.9.0-spark_3.1.jar and snowflake-jdbc-3.13.6.jar to workspace packages as well as to cluster/session packages\r\n", - "\r\n", + "// You can download those jar files from https://mvnrepository.com/artifact/net.snowflake/spark-snowflake?repo=snowflakedb and https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc/ respectevly\r\n", + "\r\n", "import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME\r\n", "val SNOWFLAKE_SOURCE_NAME = \"net.snowflake.spark.snowflake\"" ], From bf5a423321ddc440d160aaa2096056d1781d73eb Mon Sep 17 00:00:00 2001 From: Marina Levin <80341311+mlevin19@users.noreply.github.com> Date: Wed, 1 Sep 2021 09:29:32 -0400 Subject: [PATCH 18/25] Update Migrate_snowflake_schema.ipynb added downloads links to cell1 --- Notebooks/Scala/Migrate_snowflake_schema.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb index 6b918d6..9d05574 100644 --- 
a/Notebooks/Scala/Migrate_snowflake_schema.ipynb +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -37,7 +37,7 @@ "source": [ "// To use Snowflake as a data source in Spark, use the .format option to provide the Snowflake connector class name that defines the data source.\r\n", "// Please note that you need to add spark-snowflake_2.12-2.9.0-spark_3.1.jar and snowflake-jdbc-3.13.6.jar to workspace packages as well as to cluster/session packages\r\n", - "// You can download those jar files from https://mvnrepository.com/artifact/net.snowflake/spark-snowflake?repo=snowflakedb and https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc/ respectevly\r\n", + "// You can download those jar files from https://mvnrepository.com/artifact/net.snowflake/spark-snowflake?repo=snowflakedb and https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc respectevly\r\n", "\r\n", "import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME\r\n", "val SNOWFLAKE_SOURCE_NAME = \"net.snowflake.spark.snowflake\"" From cb1e62e888a12604a667adf3007894fe8dd0f8e4 Mon Sep 17 00:00:00 2001 From: Marina Levin <80341311+mlevin19@users.noreply.github.com> Date: Wed, 1 Sep 2021 10:03:08 -0400 Subject: [PATCH 19/25] Update Migrate_snowflake_schema.ipynb added links for Azure Key Vault configuration --- Notebooks/Scala/Migrate_snowflake_schema.ipynb | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb index 9d05574..3003d82 100644 --- a/Notebooks/Scala/Migrate_snowflake_schema.ipynb +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -37,7 +37,8 @@ "source": [ "// To use Snowflake as a data source in Spark, use the .format option to provide the Snowflake connector class name that defines the data source.\r\n", "// Please note that you need to add spark-snowflake_2.12-2.9.0-spark_3.1.jar and snowflake-jdbc-3.13.6.jar to workspace packages as well as to cluster/session packages\r\n", - "// You can download those jar files from https://mvnrepository.com/artifact/net.snowflake/spark-snowflake?repo=snowflakedb and https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc respectevly\r\n", + "// You can download those jar files from https://mvnrepository.com/artifact/net.snowflake/spark-snowflake?repo=snowflakedb and https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc respectevly\r\n", + "// You can find instructions how to add customized jars to cluster/session packages at https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-manage-scala-packages\r\n", "\r\n", "import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME\r\n", "val SNOWFLAKE_SOURCE_NAME = \"net.snowflake.spark.snowflake\"" @@ -105,12 +106,13 @@ "cell_type": "code", "source": [ "// To not expose snowflake credentials, it is best practice to store user, password and account in Azure Key Vault service in your suscription\r\n", + "// Please reference https://docs.microsoft.com/en-us/azure/data-factory/store-credentials-in-key-vault how to set up secrets in Azure Key Vault service\r\n", "// Please note that you need to link your Azure Key Vault service (AKV) to your Synapse workspace \r\n", "// mssparkutils package let you retrive your secrets from AKV\r\n", "\r\n", - "val user = mssparkutils.credentials.getSecret(\"azkey2021\", \"sfuser\",\"AzureKeyV1\")\r\n", - "val password = mssparkutils.credentials.getSecret(\"azkey2021\", \"sfpassword\",\"AzureKeyV1\")\r\n", - "val account = 
mssparkutils.credentials.getSecret(\"azkey2021\", \"sfaccount\",\"AzureKeyV1\")\r\n", + "val user = mssparkutils.credentials.getSecret(\"Azure Key Vault name \", \"secret name for user\",\"linked service name\")\r\n", + "val password = mssparkutils.credentials.getSecret(\"Azure Key Vault name \", \"secret name for password\",\"linked service name\")\r\n", + "val account = mssparkutils.credentials.getSecret(\"Azure Key Vault name \", \"secret name for account\",\"linked service name\")\r\n", "val account_URL = \"https://\" + account + \".azure.snowflakecomputing.com\"" ], "outputs": [ From d076bcfe5cb99fcf96789366a234682dfeb40032 Mon Sep 17 00:00:00 2001 From: Marina Levin <80341311+mlevin19@users.noreply.github.com> Date: Wed, 1 Sep 2021 10:06:14 -0400 Subject: [PATCH 20/25] Update Migrate_snowflake_schema.ipynb changed name of schema for testing --- Notebooks/Scala/Migrate_snowflake_schema.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Notebooks/Scala/Migrate_snowflake_schema.ipynb b/Notebooks/Scala/Migrate_snowflake_schema.ipynb index 3003d82..b2e6c7f 100644 --- a/Notebooks/Scala/Migrate_snowflake_schema.ipynb +++ b/Notebooks/Scala/Migrate_snowflake_schema.ipynb @@ -71,7 +71,7 @@ "source": [ "// setting default paramter \r\n", "\r\n", - "val sfschema=\"test1\"" + "val sfschema=\"existing schema\"" ], "outputs": [], "execution_count": null, From 8965326b386ad25456f92d85851910e2489385bc Mon Sep 17 00:00:00 2001 From: lilijing Date: Mon, 13 Sep 2021 12:44:50 +0800 Subject: [PATCH 21/25] Add metadata file for testing --- Notebooks/metadata.json | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 Notebooks/metadata.json diff --git a/Notebooks/metadata.json b/Notebooks/metadata.json new file mode 100644 index 0000000..9a03d65 --- /dev/null +++ b/Notebooks/metadata.json @@ -0,0 +1,22 @@ +{ + "data": [ + { + "title": "Charting in Synapse Notebook", + "path": "PySpark\\06 Charting in Synapse Notebook.ipynb", + "description": "This notebook provides examples to visualize data in Synapse notebook using matplotlib, bokeh, or seaborn.", + "tags": ["Matplotlib", "Bokeh", "Seaborn", "Visualization", "Charting"], + "types": ["PySpark"], + "categories": ["Visualization"], + "languages": ["PySpark"] + }, + { + "title": "Getting Started with Hyperspace Indexing", + "path": "PySpark\\Hitchhikers Guide to Hyperspace - Python.ipynb", + "description": "This notebook shows you a tour of using Hyperspace indexing and how you could accelerate your Apache Spark workloads. It starts with an explanation of what indexing is and dives into how you can create, use and maintain indexes. 
In addition, you will also learn how to look at query plans and understand whether indexing is being utilized by Apache Spark.", + "tags": ["Spark", "Hyperspace Indexing", "Python", "PySpark", "Incremental", "Hybrid-Scan", "Accelerate", "Join", "Filter", "Acceleration", "Fast", "Parquet", "JSON", "CSV", "Workload"], + "types": ["PySpark"], + "categories": ["Querying"], + "languages": ["PySpark"] + } + ] +} From 68058f64bb6867f85ae40fc95cba0eb55c365787 Mon Sep 17 00:00:00 2001 From: lilijing Date: Mon, 13 Sep 2021 14:36:58 +0800 Subject: [PATCH 22/25] Test full pipeline --- Notebooks/metadata.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Notebooks/metadata.json b/Notebooks/metadata.json index 9a03d65..b26662f 100644 --- a/Notebooks/metadata.json +++ b/Notebooks/metadata.json @@ -17,6 +17,15 @@ "types": ["PySpark"], "categories": ["Querying"], "languages": ["PySpark"] + }, + { + "title": "Using Azure Open Datasets in Synapse", + "path": "PySpark\\05 Using Azure Open Datasets in Synapse.ipynb", + "description": "This notebook provides examples of how to enrich NYC Green Taxi Data with Holiday and Weather.", + "tags": ["Matplotlib", "Bokeh", "Seaborn", "Visualization", "Charting"], + "types": ["PySpark"], + "categories": ["Data preparation"], + "languages": ["PySpark"] } ] } From 9c361e9ebc3108e0389ea03117efec6611465fa8 Mon Sep 17 00:00:00 2001 From: lilijing Date: Mon, 13 Sep 2021 15:45:04 +0800 Subject: [PATCH 23/25] Remove invalid characters in the notebook --- Notebooks/PySpark/05 Using Azure Open Datasets in Synapse.ipynb | 2 +- ...n and ML Modeling - NYC taxi predict using Spark MLlib.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Notebooks/PySpark/05 Using Azure Open Datasets in Synapse.ipynb b/Notebooks/PySpark/05 Using Azure Open Datasets in Synapse.ipynb index 4c356da..b3fda51 100644 --- a/Notebooks/PySpark/05 Using Azure Open Datasets in Synapse.ipynb +++ b/Notebooks/PySpark/05 Using Azure Open Datasets in Synapse.ipynb @@ -270,7 +270,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Enrich with weather data¶\n", + "## Enrich with weather data\n", "\n", "Now we append NOAA surface weather data to the taxi and holiday data. Use a similar approach to fetch the [NOAA weather history data](https://azure.microsoft.com/en-us/services/open-datasets/catalog/noaa-integrated-surface-data/) from Azure Open Datasets. " ], diff --git a/Notebooks/PySpark/07 Data Exploration and ML Modeling - NYC taxi predict using Spark MLlib.ipynb b/Notebooks/PySpark/07 Data Exploration and ML Modeling - NYC taxi predict using Spark MLlib.ipynb index 24bca0c..80b99f9 100644 --- a/Notebooks/PySpark/07 Data Exploration and ML Modeling - NYC taxi predict using Spark MLlib.ipynb +++ b/Notebooks/PySpark/07 Data Exploration and ML Modeling - NYC taxi predict using Spark MLlib.ipynb @@ -48,7 +48,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Ingest Data¶ \n", + "## Ingest Data\n", "\n", "Get a sample data of nyc yellow taxi to make it faster/easier to evaluate different approaches to prep for the modelling phase later in the notebook." 
], From 681caa3410a86477498716b1243f27426409b0fb Mon Sep 17 00:00:00 2001 From: lilijing Date: Wed, 22 Sep 2021 14:17:34 +0800 Subject: [PATCH 24/25] Example about a notebook with parameter --- Notebooks/PySpark/exampleForParameter.ipynb | 107 ++++++++++++++++++ .../parameters/exampleForParameter.json | 5 + 2 files changed, 112 insertions(+) create mode 100644 Notebooks/PySpark/exampleForParameter.ipynb create mode 100644 Notebooks/PySpark/parameters/exampleForParameter.json diff --git a/Notebooks/PySpark/exampleForParameter.ipynb b/Notebooks/PySpark/exampleForParameter.ipynb new file mode 100644 index 0000000..4b565b1 --- /dev/null +++ b/Notebooks/PySpark/exampleForParameter.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "code", + "source": [ + "account_name = '' # fill in your primary account name\r\n", + "container_name = '' # fill in your container name\r\n", + "relative_path = '' # fill in your relative folder path" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "sparkpool01", + "session_id": 0, + "statement_id": 3, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-09-17T06:08:57.5750046Z", + "session_start_time": null, + "execution_start_time": "2021-09-17T06:08:57.6725894Z", + "execution_finish_time": "2021-09-17T06:08:57.6728166Z" + }, + "text/plain": "StatementMeta(sparkpool01, 0, 3, Finished, Available)" + }, + "metadata": {} + } + ], + "execution_count": 3, + "metadata": { + "tags": [ + "parameters" + ] + } + }, + { + "cell_type": "code", + "source": [ + "from pyspark.sql import SparkSession\r\n", + "from pyspark.sql.types import *\r\n", + "\r\n", + "adls_path = 'abfss://%s@%s.dfs.core.windows.net/%s' % (container_name, account_name, relative_path)\r\n", + "print('Primary storage account path: ' + adls_path)" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": "sparkpool01", + "session_id": 0, + "statement_id": 4, + "state": "finished", + "livy_statement_state": "available", + "queued_time": "2021-09-17T06:09:00.1736428Z", + "session_start_time": null, + "execution_start_time": "2021-09-17T06:09:00.273459Z", + "execution_finish_time": "2021-09-17T06:09:00.4351164Z" + }, + "text/plain": "StatementMeta(sparkpool01, 0, 4, Finished, Available)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Primary storage account path: abfss://test0121@lilijing0227neur.dfs.core.windows.net/test01/" + ] + } + ], + "execution_count": 4, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + } + } + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "kernelspec": { + "name": "synapse_pyspark", + "language": "Python", + "display_name": "Synapse PySpark" + }, + "kernel_info": { + "name": "synapse_pyspark" + }, + "save_output": true, + "synapse_widget": { + "version": "0.1", + "state": {} + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/Notebooks/PySpark/parameters/exampleForParameter.json b/Notebooks/PySpark/parameters/exampleForParameter.json new file mode 100644 index 0000000..308ca6d --- /dev/null +++ b/Notebooks/PySpark/parameters/exampleForParameter.json @@ -0,0 +1,5 @@ +{ + "Your primary storage account name": "nbsampletestworkspace01", + "Your container name": "test0922", + "Your relative 
path": "testCSV/" +} \ No newline at end of file From 194d473a414a88ac9e163172c0ecae7a0ac5abe3 Mon Sep 17 00:00:00 2001 From: lilijing Date: Wed, 22 Sep 2021 15:41:52 +0800 Subject: [PATCH 25/25] Add example in metadata.json --- Notebooks/metadata.json | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/Notebooks/metadata.json b/Notebooks/metadata.json index b26662f..4c6871c 100644 --- a/Notebooks/metadata.json +++ b/Notebooks/metadata.json @@ -1,30 +1,12 @@ { "data": [ { - "title": "Charting in Synapse Notebook", - "path": "PySpark\\06 Charting in Synapse Notebook.ipynb", - "description": "This notebook provides examples to visualize data in Synapse notebook using matplotlib, bokeh, or seaborn.", - "tags": ["Matplotlib", "Bokeh", "Seaborn", "Visualization", "Charting"], + "title": "exampleForParameter", + "path": "PySpark\\exampleForParameter.ipynb", + "description": "This notebook provides examples to show how to handle parameters for automation test.", + "tags": ["Example"], "types": ["PySpark"], - "categories": ["Visualization"], - "languages": ["PySpark"] - }, - { - "title": "Getting Started with Hyperspace Indexing", - "path": "PySpark\\Hitchhikers Guide to Hyperspace - Python.ipynb", - "description": "This notebook shows you a tour of using Hyperspace indexing and how you could accelerate your Apache Spark workloads. It starts with an explanation of what indexing is and dives into how you can create, use and maintain indexes. In addition, you will also learn how to look at query plans and understand whether indexing is being utilized by Apache Spark.", - "tags": ["Spark", "Hyperspace Indexing", "Python", "PySpark", "Incremental", "Hybrid-Scan", "Accelerate", "Join", "Filter", "Acceleration", "Fast", "Parquet", "JSON", "CSV", "Workload"], - "types": ["PySpark"], - "categories": ["Querying"], - "languages": ["PySpark"] - }, - { - "title": "Using Azure Open Datasets in Synapse", - "path": "PySpark\\05 Using Azure Open Datasets in Synapse.ipynb", - "description": "This notebook provides examples of how to enrich NYC Green Taxi Data with Holiday and Weather.", - "tags": ["Matplotlib", "Bokeh", "Seaborn", "Visualization", "Charting"], - "types": ["PySpark"], - "categories": ["Data preparation"], + "categories": ["Example"], "languages": ["PySpark"] } ]