Skip to content

Commit 5ce5acb

Browse files
Enhance smart generation and profiling with stable column ordering (#31)
- Introduced column order preservation in smart generation processes, ensuring that generated fields maintain their DDL order. - Added functionality to hydrate column order from ClickHouse, improving the stability of column arrangements during profiling. - Updated serialization and deserialization processes to include column order, allowing for consistent handling of column arrangements across different operations. - Implemented cache invalidation for user sessions post-model generation to ensure immediate updates in Explore/Meta requests. This update improves the reliability and predictability of column handling in generated models.
1 parent fe688ee commit 5ce5acb

6 files changed

Lines changed: 201 additions & 7 deletions

File tree

services/actions/src/rpc/smartGenSchemas.js

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import apiError from "../utils/apiError.js";
22
import cubejsApi from "../utils/cubejsApi.js";
3+
import { invalidateUserCache } from "../utils/cubeCache.js";
34

45
export default async (session, input, headers) => {
56
const {
@@ -44,6 +45,12 @@ export default async (session, input, headers) => {
4445
selected_columns: selectedColumns,
4546
});
4647

48+
// Ensure subsequent Explore/Meta requests resolve the latest branch version
49+
// immediately after model generation instead of waiting for cache TTL expiry.
50+
if (userId) {
51+
invalidateUserCache(userId);
52+
}
53+
4754
return result;
4855
} catch (err) {
4956
return apiError(err);

services/cubejs/src/routes/smartGenerate.js

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,59 @@ import { deserializeProfile } from '../utils/smart-generation/profileSerializer.
1414
import { diffModels, parseCubesFromJs } from '../utils/smart-generation/diffModels.js';
1515
import { validateModelSyntax, smokeTestQuery } from '../utils/smart-generation/modelValidator.js';
1616

17+
function reorderProfileColumns(profiledTable) {
18+
if (!profiledTable?.columns || !(profiledTable.columns instanceof Map)) return profiledTable;
19+
const ordered = new Map();
20+
const seen = new Set();
21+
const preferred = Array.isArray(profiledTable.columnOrder) ? profiledTable.columnOrder : [];
22+
23+
for (const colName of preferred) {
24+
if (profiledTable.columns.has(colName)) {
25+
ordered.set(colName, profiledTable.columns.get(colName));
26+
seen.add(colName);
27+
}
28+
}
29+
30+
for (const [colName, colData] of profiledTable.columns) {
31+
if (seen.has(colName)) continue;
32+
ordered.set(colName, colData);
33+
preferred.push(colName);
34+
}
35+
36+
return {
37+
...profiledTable,
38+
columns: ordered,
39+
columnOrder: preferred,
40+
};
41+
}
42+
43+
async function hydrateColumnOrderFromClickHouse(driver, profiledTable, schema, table) {
44+
if (!driver || !profiledTable?.columns || !(profiledTable.columns instanceof Map)) {
45+
return profiledTable;
46+
}
47+
const hasStableOrder =
48+
Array.isArray(profiledTable.columnOrder)
49+
&& profiledTable.columnOrder.length > 0
50+
&& profiledTable.columnOrder.length >= profiledTable.columns.size;
51+
if (hasStableOrder) {
52+
return reorderProfileColumns(profiledTable);
53+
}
54+
55+
try {
56+
const rows = await driver.query(
57+
`SELECT name FROM system.columns WHERE database = '${schema}' AND table = '${table}' ORDER BY position`
58+
);
59+
const ddlOrder = rows.map((r) => r.name).filter((name) => profiledTable.columns.has(name));
60+
if (ddlOrder.length > 0) {
61+
return reorderProfileColumns({ ...profiledTable, columnOrder: ddlOrder });
62+
}
63+
} catch (err) {
64+
console.warn(`[smartGenerate] Column order hydration failed (non-fatal): ${err.message}`);
65+
}
66+
67+
return reorderProfileColumns(profiledTable);
68+
}
69+
1770
export default async (req, res, cubejs) => {
1871
const { securityContext } = req;
1972
const {
@@ -75,6 +128,9 @@ export default async (req, res, cubejs) => {
75128
const deserialized = deserializeProfile(profileData);
76129
profiledTable = deserialized.profiledTable;
77130
primaryKeys = deserialized.primaryKeys;
131+
// Ensure column order is stable even when profile_data transport reorders object keys.
132+
driver = await cubejs.options.driverFactory({ securityContext });
133+
profiledTable = await hydrateColumnOrderFromClickHouse(driver, profiledTable, schema, table);
78134
} else {
79135
// Legacy path: profile from scratch (two ClickHouse round-trips)
80136
driver = await cubejs.options.driverFactory({ securityContext });
@@ -89,6 +145,7 @@ export default async (req, res, cubejs) => {
89145

90146
emitter.emit('primary_keys', 'Detecting primary keys...', 0.5);
91147
primaryKeys = await detectPrimaryKeys(driver, schema, table);
148+
profiledTable = reorderProfileColumns(profiledTable);
92149
}
93150

94151
// Filter columns if user selected a subset
@@ -100,7 +157,11 @@ export default async (req, res, cubejs) => {
100157
filtered.set(name, data);
101158
}
102159
}
103-
profiledTable = { ...profiledTable, columns: filtered };
160+
const existingOrder = Array.isArray(profiledTable.columnOrder) ? profiledTable.columnOrder : [];
161+
const filteredOrder = existingOrder.length > 0
162+
? existingOrder.filter((name) => selectedColumns.has(name))
163+
: Array.from(filtered.keys());
164+
profiledTable = { ...profiledTable, columns: filtered, columnOrder: filteredOrder };
104165
}
105166

106167
// Build cubes

services/cubejs/src/utils/smart-generation/cubeBuilder.js

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,14 @@ function isInt8Boolean(rawType, profile) {
285285
* @returns {{ dimensions: object[], measures: object[], mapKeysDiscovered: number, columnsProfiled: number, columnsSkipped: number }}
286286
*/
287287
function processColumns(columns, options) {
288-
const { arrayJoinColumns = [], maxMapKeys = 500, primaryKeys = [], cubeName = 'cube', columnDescriptions = new Map() } = options;
288+
const {
289+
arrayJoinColumns = [],
290+
maxMapKeys = 500,
291+
primaryKeys = [],
292+
cubeName = 'cube',
293+
columnDescriptions = new Map(),
294+
columnOrder = [],
295+
} = options;
289296
const arrayJoinColumnNames = arrayJoinColumns.map((a) => a.column);
290297

291298
const allFields = [];
@@ -562,6 +569,37 @@ function processColumns(columns, options) {
562569
// Deduplicate field names
563570
deduplicateFields(allFields);
564571

572+
// Final ordering guard: keep generated fields in DDL column order.
573+
// This protects against accidental ordering drift in upstream payloads.
574+
const columnIndex = new Map();
575+
if (Array.isArray(columnOrder) && columnOrder.length > 0) {
576+
for (let i = 0; i < columnOrder.length; i++) {
577+
columnIndex.set(columnOrder[i], i);
578+
}
579+
} else {
580+
let idx = 0;
581+
for (const colName of columns.keys()) {
582+
columnIndex.set(colName, idx++);
583+
}
584+
}
585+
586+
const fallbackIndex = Number.MAX_SAFE_INTEGER;
587+
allFields
588+
.map((field, idx) => ({
589+
field,
590+
idx,
591+
order: columnIndex.has(field._sourceColumn)
592+
? columnIndex.get(field._sourceColumn)
593+
: fallbackIndex,
594+
}))
595+
.sort((a, b) => {
596+
if (a.order === b.order) return a.idx - b.idx;
597+
return a.order - b.order;
598+
})
599+
.forEach((entry, i) => {
600+
allFields[i] = entry.field;
601+
});
602+
565603
const dimensions = [];
566604
const measures = [];
567605

@@ -619,6 +657,7 @@ function buildRawCube(profiledTable, options) {
619657
maxMapKeys,
620658
primaryKeys,
621659
cubeName,
660+
columnOrder: profiledTable.columnOrder || [],
622661
columnDescriptions: profiledTable.columnDescriptions || new Map(),
623662
});
624663

services/cubejs/src/utils/smart-generation/merger.js

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,27 @@ function mergeFields(existingFields, newFields, keepStale) {
209209
}
210210
}
211211

212-
return merged;
212+
// Preserve DDL-based ordering from the incoming generation for all fields
213+
// that exist in the incoming set. Fields not present in incoming (e.g. stale
214+
// auto fields, preserved user/AI fields) keep relative order and are placed after.
215+
const incomingOrder = new Map();
216+
for (let i = 0; i < incoming.length; i++) {
217+
incomingOrder.set(incoming[i].name, i);
218+
}
219+
220+
return merged
221+
.map((field, idx) => ({
222+
field,
223+
idx,
224+
order: incomingOrder.has(field.name)
225+
? incomingOrder.get(field.name)
226+
: Number.MAX_SAFE_INTEGER,
227+
}))
228+
.sort((a, b) => {
229+
if (a.order === b.order) return a.idx - b.idx;
230+
return a.order - b.order;
231+
})
232+
.map((entry) => entry.field);
213233
}
214234

215235
// ---------------------------------------------------------------------------

services/cubejs/src/utils/smart-generation/profileSerializer.js

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,34 @@ export function serializeProfile(profiledTable, primaryKeys) {
2121
}
2222

2323
const columns = {};
24+
const columnOrder = [];
25+
const preferredOrder = Array.isArray(profiledTable.columnOrder) ? profiledTable.columnOrder : null;
26+
27+
if (preferredOrder && profiledTable.columns && typeof profiledTable.columns === 'object') {
28+
for (const key of preferredOrder) {
29+
const value = profiledTable.columns instanceof Map
30+
? profiledTable.columns.get(key)
31+
: profiledTable.columns[key];
32+
if (value !== undefined) {
33+
columns[key] = value;
34+
columnOrder.push(key);
35+
}
36+
}
37+
}
38+
2439
if (profiledTable.columns instanceof Map) {
2540
for (const [key, value] of profiledTable.columns) {
41+
if (Object.prototype.hasOwnProperty.call(columns, key)) continue;
2642
columns[key] = value;
43+
columnOrder.push(key);
2744
}
2845
} else if (profiledTable.columns && typeof profiledTable.columns === 'object') {
2946
// Already a plain object — pass through
30-
Object.assign(columns, profiledTable.columns);
47+
for (const [key, value] of Object.entries(profiledTable.columns)) {
48+
if (Object.prototype.hasOwnProperty.call(columns, key)) continue;
49+
columns[key] = value;
50+
columnOrder.push(key);
51+
}
3152
}
3253

3354
const columnDescriptions = {};
@@ -47,6 +68,7 @@ export function serializeProfile(profiledTable, primaryKeys) {
4768
sampled: profiledTable.sampled,
4869
sample_size: profiledTable.sample_size,
4970
columns,
71+
columnOrder,
5072
columnDescriptions,
5173
},
5274
primaryKeys: primaryKeys || [],
@@ -68,7 +90,20 @@ export function deserializeProfile(serialized) {
6890

6991
const columns = new Map();
7092
if (src.columns && typeof src.columns === 'object') {
93+
const preferredOrder = Array.isArray(src.columnOrder) ? src.columnOrder : [];
94+
const seen = new Set();
95+
96+
// Reconstruct in preserved DDL order first.
97+
for (const key of preferredOrder) {
98+
if (Object.prototype.hasOwnProperty.call(src.columns, key)) {
99+
columns.set(key, src.columns[key]);
100+
seen.add(key);
101+
}
102+
}
103+
104+
// Include any unexpected keys not present in columnOrder.
71105
for (const [key, value] of Object.entries(src.columns)) {
106+
if (seen.has(key)) continue;
72107
columns.set(key, value);
73108
}
74109
}
@@ -88,6 +123,7 @@ export function deserializeProfile(serialized) {
88123
sampled: src.sampled,
89124
sample_size: src.sample_size,
90125
columns,
126+
columnOrder: Array.isArray(src.columnOrder) ? src.columnOrder : undefined,
91127
columnDescriptions,
92128
},
93129
primaryKeys: serialized.primaryKeys || [],

services/cubejs/src/utils/smart-generation/profiler.js

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -483,11 +483,20 @@ export async function profileTable(driver, schema, table, options = {}) {
483483
return [];
484484
});
485485

486-
const [metaResult, describeRows, tableCommentRows, columnCommentRows] = await Promise.all([
486+
// Fetch DDL column positions from system.columns to preserve true table order
487+
const columnOrderPromise = driver.query(
488+
`SELECT name, position FROM system.columns WHERE database = '${schema}' AND table = '${table}' ORDER BY position`
489+
).catch(err => {
490+
console.warn(`[profiler] Column order fetch failed (non-fatal): ${err.message}`);
491+
return [];
492+
});
493+
494+
const [metaResult, describeRows, tableCommentRows, columnCommentRows, columnOrderRows] = await Promise.all([
487495
metaPromise,
488496
driver.query(`DESCRIBE TABLE ${schema}.\`${table}\``),
489497
tableCommentPromise,
490498
columnCommentsPromise,
499+
columnOrderPromise,
491500
]);
492501

493502
// Build column descriptions map
@@ -514,9 +523,29 @@ export async function profileTable(driver, schema, table, options = {}) {
514523
tracker.emit('init', `Found ${emptyColumns.size} columns with zero bytes — will skip`, { empty_columns: emptyColumns.size });
515524
}
516525

517-
// Build columns map from DESCRIBE
526+
// Build a stable DDL order map from system.columns.position
527+
const ddlPositionByName = new Map();
528+
for (const row of columnOrderRows) {
529+
if (row?.name != null && row?.position != null) {
530+
ddlPositionByName.set(row.name, Number(row.position));
531+
}
532+
}
533+
534+
// Order DESCRIBE rows by DDL position when available (fallback: original order)
535+
const describeRowsWithIndex = describeRows.map((row, index) => ({ row, index }));
536+
describeRowsWithIndex.sort((a, b) => {
537+
const aPos = ddlPositionByName.get(a.row.name);
538+
const bPos = ddlPositionByName.get(b.row.name);
539+
const aOrder = Number.isFinite(aPos) ? aPos : Number.MAX_SAFE_INTEGER;
540+
const bOrder = Number.isFinite(bPos) ? bPos : Number.MAX_SAFE_INTEGER;
541+
if (aOrder !== bOrder) return aOrder - bOrder;
542+
return a.index - b.index;
543+
});
544+
const columnOrder = describeRowsWithIndex.map(({ row }) => row.name);
545+
546+
// Build columns map from ordered DESCRIBE rows
518547
const columns = new Map();
519-
for (const row of describeRows) {
548+
for (const { row } of describeRowsWithIndex) {
520549
const colName = row.name;
521550
const parsed = parseType(row.type, colName);
522551
columns.set(colName, {
@@ -625,6 +654,7 @@ export async function profileTable(driver, schema, table, options = {}) {
625654
sample_size: null,
626655
sampling_method: 'none',
627656
columns,
657+
columnOrder,
628658
tableDescription: null,
629659
columnDescriptions: new Map(),
630660
filters: normalizedFilters,
@@ -1023,6 +1053,7 @@ export async function profileTable(driver, schema, table, options = {}) {
10231053
sample_size: needsSampling ? Math.min(Math.round(rowCount / SAMPLE_RATIO), SUBQUERY_LIMIT_MAX) : null,
10241054
sampling_method: needsSampling ? 'subquery_limit' : 'none',
10251055
columns,
1056+
columnOrder,
10261057
tableDescription,
10271058
columnDescriptions,
10281059
filters: normalizedFilters.length > 0 ? normalizedFilters : undefined,

0 commit comments

Comments
 (0)