diff --git a/language-extensions/dotnet-core-CSharp/README.md b/language-extensions/dotnet-core-CSharp/README.md index 5053fd1..3439c07 100644 --- a/language-extensions/dotnet-core-CSharp/README.md +++ b/language-extensions/dotnet-core-CSharp/README.md @@ -50,3 +50,59 @@ Not Supported. After downloading or building the dotnet-core-CSharp-lang-extension.zip, use [CREATE EXTERNAL LANGUAGE](https://docs.microsoft.com/en-us/sql/t-sql/statements/create-external-language-transact-sql?view=sql-server-ver15) to register the language with SQL Server 2019 CU3+. This [tutorial](./sample/regex/README.md) will walk you through an end to end sample using the .NET Core C# language extension. + +## Output Schema Support + +By default, output column types are inferred from the .NET DataFrame column types. For string columns, you can explicitly specify the SQL data type using the `OutputColumnDataTypes` property. + +### Specifying Output Column Types + +Use `OutputColumnDataTypes` to specify the SQL data type of string output columns by name. Only `SqlDataType.DotNetChar` and `SqlDataType.DotNetWChar` are accepted; any other value for a string column causes an `ArgumentException`, and entries for non-string columns are ignored: + +```csharp +using Microsoft.SqlServer.CSharpExtension.SDK; +using Microsoft.Data.Analysis; +using static Microsoft.SqlServer.CSharpExtension.Sql; + +public class MyExecutor : AbstractSqlServerExtensionExecutor +{ + public override DataFrame Execute(DataFrame input, Dictionary sqlParams) + { + // Specify NVARCHAR (UTF-16) output for a string column + OutputColumnDataTypes["unicode_column"] = SqlDataType.DotNetWChar; + + // Process and return data + return resultDataFrame; + } +} +``` + +### Supported String Types + +| SqlDataType | SQL Type | Encoding | Description | +|-------------|----------|----------|-------------| +| `SqlDataType.DotNetChar` | VARCHAR | UTF-8 | Default for string columns | +| `SqlDataType.DotNetWChar` | NVARCHAR | UTF-16 | Opt-in; use when the column must be returned as NVARCHAR | + +### Example: Mixed VARCHAR and NVARCHAR Output + +```csharp +public class MixedOutputExecutor : AbstractSqlServerExtensionExecutor +{ + public override DataFrame Execute(DataFrame 
input, Dictionary sqlParams) + { + // "ascii_col" will default to VARCHAR (no configuration needed) + + // "unicode_col" should be NVARCHAR + OutputColumnDataTypes["unicode_col"] = SqlDataType.DotNetWChar; + + return input; + } +} +``` + +### Default Behavior + +If `OutputColumnDataTypes` has no entry for a column: +- String columns default to `DotNetChar` (VARCHAR/UTF-8) +- Numeric and other non-string columns are mapped automatically from their .NET types diff --git a/language-extensions/dotnet-core-CSharp/sample/regex/pkg/RegexSample.cs b/language-extensions/dotnet-core-CSharp/sample/regex/pkg/RegexSample.cs index 7dd4e13..b0015cc 100644 --- a/language-extensions/dotnet-core-CSharp/sample/regex/pkg/RegexSample.cs +++ b/language-extensions/dotnet-core-CSharp/sample/regex/pkg/RegexSample.cs @@ -14,6 +14,7 @@ using Microsoft.Data.Analysis; using Microsoft.SqlServer.CSharpExtension.SDK; using System.Text.RegularExpressions; +using static Microsoft.SqlServer.CSharpExtension.Sql; namespace UserExecutor { @@ -60,6 +61,11 @@ public override DataFrame Execute(DataFrame input, Dictionary s sqlParams["@rowsCount"] = output.Rows.Count; sqlParams["@regexExpr"] = "Success!"; + // Optionally, specify that the "text" column should be output as NVARCHAR (UTF-16) + // instead of the default VARCHAR (UTF-8). Uncomment the line below to enable: + // + // OutputColumnDataTypes["text"] = SqlDataType.DotNetWChar; + // Return output dataset as a DataFrame // return output; diff --git a/language-extensions/dotnet-core-CSharp/src/managed/CSharpOutputDataSet.cs b/language-extensions/dotnet-core-CSharp/src/managed/CSharpOutputDataSet.cs index 8eee4b1..93138cd 100644 --- a/language-extensions/dotnet-core-CSharp/src/managed/CSharpOutputDataSet.cs +++ b/language-extensions/dotnet-core-CSharp/src/managed/CSharpOutputDataSet.cs @@ -45,19 +45,42 @@ public class CSharpOutputDataSet: CSharpDataSet /// by extracting data and information from every DataFrameColumn. /// /// The DataFrame containing the output data. 
- public unsafe void ExtractColumns(DataFrame dataFrame) + /// Optional user-specified column data types by name. + public unsafe void ExtractColumns( + DataFrame dataFrame, + Dictionary outputColumnDataTypes = null) { Logging.Trace("CSharpOutputDataSet::ExtractColumns"); _strLenOrNullMapPtrs = new IntPtr[ColumnsNumber]; _dataPtrs = new IntPtr[ColumnsNumber]; + for(ushort columnNumber = 0; columnNumber < ColumnsNumber; ++columnNumber) { DataFrameColumn column = dataFrame.Columns[columnNumber]; // Determine the SQL data type for this column. - // All .NET strings are output as DotNetChar (varchar/UTF-8). + // Default behavior: map .NET types to SQL types (strings -> DotNetChar/varchar). // SqlDataType dataType = DataTypeMap[column.DataType]; + + // For string columns, check for user-specified type override + // + if (column.DataType == typeof(string) && outputColumnDataTypes != null) + { + if (outputColumnDataTypes.TryGetValue(column.Name, out var userType)) + { + if (userType != SqlDataType.DotNetChar && userType != SqlDataType.DotNetWChar) + { + throw new ArgumentException( + $"Invalid type override '{userType}' for string column '{column.Name}'. " + + $"Only DotNetChar and DotNetWChar are supported for string columns."); + } + + dataType = userType; + Logging.Trace($"ExtractColumns: Column '{column.Name}' using user-specified type: {dataType}"); + } + } + ulong columnSize = (ulong)DataTypeSize[dataType]; // Add column metadata to a CSharpColumn dictionary @@ -186,12 +209,11 @@ DataFrameColumn column break; case SqlDataType.DotNetWChar: // Calculate column size from actual data. - // columnSize = max character count (UTF-16 byte length / 2). - // Minimum size is 1 character (nchar(0) is illegal in SQL). + // For WCHAR types, column size should be the max UTF-16 byte length. + // The result is never smaller than MinUtf16CharSize (nchar(0) is illegal in SQL). // int maxUnicodeByteLen = colMap.Length > 0 ? 
colMap.Where(x => x > 0).DefaultIfEmpty(0).Max() : 0; - int maxCharCount = maxUnicodeByteLen / sizeof(char); - _columns[columnNumber].Size = (ulong)Math.Max(maxCharCount, MinUtf16CharSize); + _columns[columnNumber].Size = (ulong)Math.Max(maxUnicodeByteLen, MinUtf16CharSize); SetDataPtrs(columnNumber, GetUnicodeStringArray(column)); break; diff --git a/language-extensions/dotnet-core-CSharp/src/managed/CSharpSession.cs b/language-extensions/dotnet-core-CSharp/src/managed/CSharpSession.cs index 8d396ed..13d967c 100644 --- a/language-extensions/dotnet-core-CSharp/src/managed/CSharpSession.cs +++ b/language-extensions/dotnet-core-CSharp/src/managed/CSharpSession.cs @@ -193,7 +193,12 @@ public void Execute( { _outputDataSet.ColumnsNumber = (ushort)_outputDataSet.CSharpDataFrame.Columns.Count; - _outputDataSet.ExtractColumns(_outputDataSet.CSharpDataFrame); + // Pass user-specified output column types + // + _outputDataSet.ExtractColumns( + _outputDataSet.CSharpDataFrame, + _userDll.UserExecutor.OutputColumnDataTypes); + *outputSchemaColumnsNumber = _outputDataSet.ColumnsNumber; } else diff --git a/language-extensions/dotnet-core-CSharp/src/managed/sdk/AbstractSqlServerExtensionExecutor.cs b/language-extensions/dotnet-core-CSharp/src/managed/sdk/AbstractSqlServerExtensionExecutor.cs index 624f30f..236254c 100644 --- a/language-extensions/dotnet-core-CSharp/src/managed/sdk/AbstractSqlServerExtensionExecutor.cs +++ b/language-extensions/dotnet-core-CSharp/src/managed/sdk/AbstractSqlServerExtensionExecutor.cs @@ -10,6 +10,7 @@ //********************************************************************* using System.Collections.Generic; using Microsoft.Data.Analysis; +using static Microsoft.SqlServer.CSharpExtension.Sql; namespace Microsoft.SqlServer.CSharpExtension.SDK { @@ -23,6 +24,13 @@ public abstract class AbstractSqlServerExtensionExecutor /// public readonly int SQLSERVER_DOTNET_LANG_EXTENSION_V1 = 1; + /// + /// Optional: Specify SQL data types for output columns by 
name. + /// Use this to output string columns as NVARCHAR (DotNetWChar) instead of the default VARCHAR (DotNetChar). Entries only apply to string columns; any value other than DotNetChar or DotNetWChar for a string column causes an ArgumentException when the output columns are extracted. + /// Example: OutputColumnDataTypes["text"] = SqlDataType.DotNetWChar; + /// + public Dictionary OutputColumnDataTypes { get; } = new Dictionary(); + /// /// Default constructor for AbstractSqlServerExtensionExecutor /// diff --git a/language-extensions/dotnet-core-CSharp/test/src/managed/CSharpTestExecutor.cs b/language-extensions/dotnet-core-CSharp/test/src/managed/CSharpTestExecutor.cs index 9abe9ca..f55517a 100644 --- a/language-extensions/dotnet-core-CSharp/test/src/managed/CSharpTestExecutor.cs +++ b/language-extensions/dotnet-core-CSharp/test/src/managed/CSharpTestExecutor.cs @@ -13,6 +13,7 @@ using System.Collections.Generic; using Microsoft.Data.Analysis; using Microsoft.SqlServer.CSharpExtension.SDK; +using static Microsoft.SqlServer.CSharpExtension.Sql; namespace Microsoft.SqlServer.CSharpExtensionTest { @@ -187,4 +188,49 @@ public override DataFrame Execute(DataFrame input, Dictionary s return null; } } + + /// + /// Test executor demonstrating NVARCHAR output support for DataFrame columns. + /// Uses OutputColumnDataTypes to specify that the "text" column should be NVARCHAR. + /// + public class CSharpTestExecutorNVarcharOutput: AbstractSqlServerExtensionExecutor + { + public override DataFrame Execute(DataFrame input, Dictionary sqlParams){ + Console.WriteLine("Hello .NET Core CSharpExtension!"); + // Specify that output column "text" should be NVARCHAR (UTF-16) + OutputColumnDataTypes["text"] = SqlDataType.DotNetWChar; + + // Return input unchanged - the column type will be NVARCHAR instead of VARCHAR + return input; + } + } + + /// + /// Test executor demonstrating mixed VARCHAR and NVARCHAR output columns. 
+ /// + public class CSharpTestExecutorMixedStringOutput: AbstractSqlServerExtensionExecutor + { + public override DataFrame Execute(DataFrame input, Dictionary sqlParams){ + Console.WriteLine("Hello .NET Core CSharpExtension!"); + // Column "ascii_col" stays VARCHAR (default, no need to specify) + + // Column "unicode_col" should be NVARCHAR (by name) + OutputColumnDataTypes["unicode_col"] = SqlDataType.DotNetWChar; + + return input; + } + } + + /// + /// Test executor for basic pass-through (no NVARCHAR configuration). + /// + public class CSharpTestExecutorPreserveInputTypes: AbstractSqlServerExtensionExecutor + { + public override DataFrame Execute(DataFrame input, Dictionary sqlParams){ + Console.WriteLine("Hello .NET Core CSharpExtension!"); + // No explicit OutputColumnDataTypes configuration. + // All string columns will be VARCHAR (default). + return input; + } + } } diff --git a/language-extensions/dotnet-core-CSharp/test/src/native/CSharpGetResultColumnTests.cpp b/language-extensions/dotnet-core-CSharp/test/src/native/CSharpGetResultColumnTests.cpp index 6724243..5512632 100644 --- a/language-extensions/dotnet-core-CSharp/test/src/native/CSharpGetResultColumnTests.cpp +++ b/language-extensions/dotnet-core-CSharp/test/src/native/CSharpGetResultColumnTests.cpp @@ -373,8 +373,8 @@ namespace ExtensionApiTest // Description: // Test GetResultColumn with an InputDataSet of nvarchar/nchar (Unicode) columns. // Tests nullptr, empty strings, and basic Unicode characters. - // Note: Output columns are returned as SQL_C_CHAR (UTF-8 encoded) regardless - // of input type, since .NET strings are always output as varchar. + // Note: Without explicit OutputColumnDataTypes configuration, string columns + // default to SQL_C_CHAR (VARCHAR) output, regardless of input type. 
// TEST_F(CSharpExtensionApiTests, GetWStringResultColumnsTest) { @@ -435,38 +435,251 @@ namespace ExtensionApiTest strLen_or_Ind.data(), columnNames); - // C# outputs all string columns as SQL_C_CHAR (UTF-8). + // NVARCHAR input columns are converted to SQL_C_CHAR output by default. + // To preserve NVARCHAR, use OutputColumnDataTypes["columnName"] = SqlDataType.DotNetWChar. // For ASCII strings, UTF-8 byte length == character count. - // We divide by sizeof(wchar_t) to get character count from UTF-16 byte length. // - SQLULEN maxCol1Len = GetMaxLength(strLenOrIndCol1.data(), rowsNumber) / sizeof(wchar_t); - SQLULEN maxCol2Len = GetMaxLength(strLenOrIndCol2.data(), rowsNumber) / sizeof(wchar_t); + SQLULEN maxCol1CharCount = GetMaxLength(strLenOrIndCol1.data(), rowsNumber) / sizeof(wchar_t); + SQLULEN maxCol2CharCount = GetMaxLength(strLenOrIndCol2.data(), rowsNumber) / sizeof(wchar_t); - // Output is SQL_C_CHAR (UTF-8), column size is max UTF-8 byte length - // For ASCII strings, this equals the character count + // Output is SQL_C_CHAR (VARCHAR default), column size is max character count // GetResultColumn( 0, // columnNumber - SQL_C_CHAR, // dataType (UTF-8 output) - maxCol1Len, // columnSize + SQL_C_CHAR, // dataType (default VARCHAR) + maxCol1CharCount, // columnSize (character count) 0, // decimalDigits SQL_NO_NULLS); // nullable GetResultColumn( 1, // columnNumber - SQL_C_CHAR, // dataType (UTF-8 output) - maxCol2Len, // columnSize + SQL_C_CHAR, // dataType (default VARCHAR) + maxCol2CharCount, // columnSize (character count) 0, // decimalDigits SQL_NULLABLE); // nullable GetResultColumn( 2, // columnNumber - SQL_C_CHAR, // dataType (UTF-8 output) - sizeof(SQLCHAR), // columnSize (1 for null column) + SQL_C_CHAR, // dataType (default VARCHAR) + 1, // columnSize (minimum 1 for null column) 0, // decimalDigits SQL_NULLABLE); // nullable } + //---------------------------------------------------------------------------------------------- + // Name: 
GetNVarcharOutputResultColumnsTest + // + // Description: + // Test GetResultColumn with an InputDataSet of nvarchar columns where the executor + // explicitly specifies NVARCHAR output using OutputColumnDataTypes["text"] = SqlDataType.DotNetWChar. + // This verifies that user-specified column metadata is correctly applied. + // + TEST_F(CSharpExtensionApiTests, GetNVarcharOutputResultColumnsTest) + { + // Use the executor that explicitly sets OutputColumnDataTypes["text"] = SqlDataType.DotNetWChar + // + string scriptNVarcharOutput = m_UserLibName + m_Separator + + "Microsoft.SqlServer.CSharpExtensionTest.CSharpTestExecutorNVarcharOutput"; + + InitializeSession( + 1, // inputSchemaColumnsNumber + 0, // parametersNumber + scriptNVarcharOutput); + + string textColumnName = "text"; + InitializeColumn(0, textColumnName, SQL_C_WCHAR, m_CharSize); + + // Input data as UTF-16 (nvarchar) + // + vector wstringCol1{ L"Hello", L"World", L"Test", L"Unicode", L"\x4F60\x597D" }; // Last one is Chinese for "Hello" + int rowsNumber = wstringCol1.size(); + + vector strLenOrIndCol1 = + { static_cast(5 * sizeof(wchar_t)), // "Hello" + static_cast(5 * sizeof(wchar_t)), // "World" + static_cast(4 * sizeof(wchar_t)), // "Test" + static_cast(7 * sizeof(wchar_t)), // "Unicode" + static_cast(2 * sizeof(wchar_t)) }; // Chinese chars + + vector strLen_or_Ind{ strLenOrIndCol1.data() }; + + vector wstringCol1Data = GenerateContiguousData(wstringCol1, strLenOrIndCol1.data()); + + void* dataSet[] = { wstringCol1Data.data() }; + + vector columnNames{ textColumnName }; + + Execute( + rowsNumber, + dataSet, + strLen_or_Ind.data(), + columnNames); + + // With OutputColumnDataTypes["text"] = SqlDataType.DotNetWChar, output should be SQL_C_WCHAR + // Column size is max byte count (matching extension host expectations) + // + SQLULEN maxCol1ByteLen = 7 * sizeof(wchar_t); // "Unicode" is the longest at 7 characters * 2 bytes + + GetResultColumn( + 0, // columnNumber + SQL_C_WCHAR, // dataType (UTF-16 
output due to NVarchar setting) + maxCol1ByteLen, // columnSize (byte count) + 0, // decimalDigits + SQL_NO_NULLS); // nullable + } + + //---------------------------------------------------------------------------------------------- + // Name: GetPreserveNVarcharTypeResultColumnsTest + // + // Description: + // Test GetResultColumn where input columns are NVARCHAR but output defaults to VARCHAR + // since no explicit OutputColumnDataTypes configuration is provided. + // Uses CSharpTestExecutorPreserveInputTypes which doesn't set explicit column metadata. + // + TEST_F(CSharpExtensionApiTests, GetPreserveNVarcharTypeResultColumnsTest) + { + // Use the executor that just returns input unchanged without explicit OutputColumnDataTypes config + // + string scriptPreserve = m_UserLibName + m_Separator + + "Microsoft.SqlServer.CSharpExtensionTest.CSharpTestExecutorPreserveInputTypes"; + + InitializeSession( + 2, // inputSchemaColumnsNumber + 0, // parametersNumber + scriptPreserve); + + string nvarcharColumnName = "nvarchar_col"; + string varcharColumnName = "varchar_col"; + InitializeColumn(0, nvarcharColumnName, SQL_C_WCHAR, m_CharSize); // NVARCHAR input + InitializeColumn(1, varcharColumnName, SQL_C_CHAR, m_CharSize); // VARCHAR input + + // Input data + // + vector wstringCol{ L"Hello", L"World", L"Test", L"Data", L"Row5" }; + vector stringCol{ "ASCII1", "ASCII2", "ASCII3", "ASCII4", "ASCII5" }; + int rowsNumber = wstringCol.size(); + + vector strLenOrIndCol1 = + { static_cast(5 * sizeof(wchar_t)), + static_cast(5 * sizeof(wchar_t)), + static_cast(4 * sizeof(wchar_t)), + static_cast(4 * sizeof(wchar_t)), + static_cast(4 * sizeof(wchar_t)) }; + + vector strLenOrIndCol2 = + { 6, 6, 6, 6, 6 }; // All strings are 6 bytes + + vector strLen_or_Ind{ strLenOrIndCol1.data(), strLenOrIndCol2.data() }; + + vector wstringColData = GenerateContiguousData(wstringCol, strLenOrIndCol1.data()); + vector stringColData = GenerateContiguousData(stringCol, strLenOrIndCol2.data()); + + 
void* dataSet[] = { wstringColData.data(), stringColData.data() }; + + vector columnNames{ nvarcharColumnName, varcharColumnName }; + + Execute( + rowsNumber, + dataSet, + strLen_or_Ind.data(), + columnNames); + + // Column 0: Input was NVARCHAR, but output defaults to SQL_C_CHAR (VARCHAR) + // since no OutputColumnDataTypes configuration is provided. + // + GetResultColumn( + 0, // columnNumber + SQL_C_CHAR, // dataType (default VARCHAR) + 5, // columnSize (max char count) + 0, // decimalDigits + SQL_NO_NULLS); // nullable + + // Column 1: Input was VARCHAR, output should be SQL_C_CHAR + // + GetResultColumn( + 1, // columnNumber + SQL_C_CHAR, // dataType (preserved from input) + 6, // columnSize (max byte length) + 0, // decimalDigits + SQL_NO_NULLS); // nullable + } + + //---------------------------------------------------------------------------------------------- + // Name: GetMixedStringOutputResultColumnsTest + // + // Description: + // Test GetResultColumn with mixed VARCHAR and NVARCHAR output columns. + // Uses CSharpTestExecutorMixedStringOutput which sets OutputColumnDataTypes + // for "unicode_col" only, leaving "ascii_col" as default VARCHAR. 
+ // + TEST_F(CSharpExtensionApiTests, GetMixedStringOutputResultColumnsTest) + { + // Use the executor that sets OutputColumnDataTypes["unicode_col"] = SqlDataType.DotNetWChar + // + string scriptMixed = m_UserLibName + m_Separator + + "Microsoft.SqlServer.CSharpExtensionTest.CSharpTestExecutorMixedStringOutput"; + + InitializeSession( + 2, // inputSchemaColumnsNumber + 0, // parametersNumber + scriptMixed); + + string asciiColumnName = "ascii_col"; + string unicodeColumnName = "unicode_col"; + InitializeColumn(0, asciiColumnName, SQL_C_CHAR, m_CharSize); + InitializeColumn(1, unicodeColumnName, SQL_C_WCHAR, m_CharSize); + + // Input data + // + vector stringCol{ "Hello", "World", "Test", "Data!", "Row05" }; + vector wstringCol{ L"Alpha", L"Beta", L"Gamma", L"Delta", L"Omega" }; + int rowsNumber = stringCol.size(); + + vector strLenOrIndCol1 = + { 5, 5, 4, 5, 5 }; + + vector strLenOrIndCol2 = + { static_cast(5 * sizeof(wchar_t)), + static_cast(4 * sizeof(wchar_t)), + static_cast(5 * sizeof(wchar_t)), + static_cast(5 * sizeof(wchar_t)), + static_cast(5 * sizeof(wchar_t)) }; + + vector strLen_or_Ind{ strLenOrIndCol1.data(), strLenOrIndCol2.data() }; + + vector stringColData = GenerateContiguousData(stringCol, strLenOrIndCol1.data()); + vector wstringColData = GenerateContiguousData(wstringCol, strLenOrIndCol2.data()); + + void* dataSet[] = { stringColData.data(), wstringColData.data() }; + + vector columnNames{ asciiColumnName, unicodeColumnName }; + + Execute( + rowsNumber, + dataSet, + strLen_or_Ind.data(), + columnNames); + + // Column 0 (ascii_col): No OutputColumnDataTypes config -> default SQL_C_CHAR (VARCHAR) + // + GetResultColumn( + 0, // columnNumber + SQL_C_CHAR, // dataType (default VARCHAR) + 5, // columnSize (max byte length) + 0, // decimalDigits + SQL_NO_NULLS); // nullable + + // Column 1 (unicode_col): Explicitly set to DotNetWChar -> SQL_C_WCHAR (NVARCHAR) + // + SQLULEN maxCol2ByteLen = 5 * sizeof(wchar_t); // "Alpha"/"Gamma"/"Delta"/"Omega" are 
5 chars + GetResultColumn( + 1, // columnNumber + SQL_C_WCHAR, // dataType (NVARCHAR due to OutputColumnDataTypes) + maxCol2ByteLen, // columnSize (byte count) + 0, // decimalDigits + SQL_NO_NULLS); // nullable + } + //---------------------------------------------------------------------------------------------- // Name: GetResultColumn //