diff --git a/Sources/table/CellType.swift b/Sources/table/CellType.swift index 475d549..ee533f5 100644 --- a/Sources/table/CellType.swift +++ b/Sources/table/CellType.swift @@ -76,7 +76,7 @@ enum CellType { } } - debug("Infered cell types: \(CellType.toString(types))") + debug("Inferred cell types: \(CellType.toString(types))") return types } diff --git a/Sources/table/Expressions.swift b/Sources/table/Expressions.swift index e611e36..3ab884a 100644 --- a/Sources/table/Expressions.swift +++ b/Sources/table/Expressions.swift @@ -140,7 +140,8 @@ class Functions { Random(), RandomChoice(), Prefix(), - Array() + Array(), + Distinct() ] static func find(name: String) -> (any InternalFunction)? { @@ -310,4 +311,25 @@ class Functions { return "array(str) – returns a Cassandra representation of an array with the provided elements. Requires a comma-separated list of arguments or at least a single argument that will be split by commas" } } + + class Distinct: InternalFunction { + var name: String { "distinct" } + + func validate(header: Header?, arguments: [any FormatExpr]) throws { + if arguments.count != 1 { + throw RuntimeError("Function \(name) requires exactly one argument") + } + } + + func apply(row: Row, arguments: [any FormatExpr]) throws -> String { + let arguments = try arguments.map { try $0.fill(row: row) } + let elements = arguments.count > 1 ? arguments : arguments[0].split(separator: Character(",")).map { String($0).trimmingCharacters(in: .whitespaces) } + + return Set(elements).joined(separator: ",") + } + + var description: String { + return "distinct(str) – returns the distinct elements from a comma-separated list of elements. 
Requires a single argument that will be split by commas" + } + } } \ No newline at end of file diff --git a/Sources/table/Extensions.swift b/Sources/table/Extensions.swift index 60506f6..696074b 100644 --- a/Sources/table/Extensions.swift +++ b/Sources/table/Extensions.swift @@ -50,9 +50,9 @@ extension Array { } extension Table { - func memoized() -> any Table { + func memoized() -> InMemoryTableView { if self is InMemoryTableView { - return self + return self as! InMemoryTableView } else { return InMemoryTableView(table: self) } diff --git a/Sources/table/MainApp.swift b/Sources/table/MainApp.swift index 45132c8..84f2931 100644 --- a/Sources/table/MainApp.swift +++ b/Sources/table/MainApp.swift @@ -125,6 +125,12 @@ struct MainApp: AsyncParsableCommand { @Option(name: .customLong("distinct"), help: "Returns only distinct values for the specified column set. Example: --distinct name,city_id.") var distinctColumns: [String] = [] + @Option(name: .customLong("duplicate"), help: "Outputs only duplicate rows by the specified columns. Example: --duplicate name,city_id will find duplicates by both name and city_id columns.") + var duplicateColumns: [String] = [] + + @Option(name: .customLong("group-by"), help: "Groups rows by the specified columns. Example: --group-by city_id,region.") + var groupBy: [String] = [] + @Option(name: .customLong("join"), help: "Speficies a second file path to join with the current one. Joining column is the first one for both tables or can be specified by the --on option.") var joinFile: String? 
@@ -192,6 +198,16 @@ struct MainApp: AsyncParsableCommand { table = DistinctTableView(table: table, distinctColumns: distinctColumns) } + if !duplicateColumns.isEmpty { + try duplicateColumns.forEach { if table.header.index(ofColumn: $0) == nil { throw RuntimeError("Column \($0) in duplicate clause is not found in the table") } } + table = DuplicateTableView(table: table, duplicateColumns: duplicateColumns) + } + + if !groupBy.isEmpty { + try groupBy.forEach { if table.header.index(ofColumn: $0) == nil { throw RuntimeError("Column \($0) in group-by clause is not found in the table") } } + table = GroupedTableView(table: table, groupBy: groupBy) + } + let formatOpt = try printFormat.map { try Format(format: $0).validated(header: table.header) } if let sortColumns { diff --git a/Sources/table/Table.swift b/Sources/table/Table.swift index 24e8d9e..f923c0f 100644 --- a/Sources/table/Table.swift +++ b/Sources/table/Table.swift @@ -126,12 +126,12 @@ class ParsedTable: Table { if let row = reader.readLine() { if row.matches(ParsedTable.ownHeaderPattern) { debug("Detected tool own table format") - let parsedHeader = try reader.readLine().map { - try! Header(data: $0, delimeter: "│", trim: true, hasOuterBorders: true) - }.orThrow(RuntimeError("Failed to parse own table header")) + // Note the use of long pipe │ instead of short one | + let parsedHeader = try reader.readLine().map { try! Header(data: $0, delimeter: "│", trim: true, hasOuterBorders: true) }.orThrow(RuntimeError("Failed to parse own table header")) - let dataRows = [reader.readLine(), reader.readLine(), reader.readLine()].compactMap{$0}.filter { !ParsedTable.technicalRow($0) } - let types = userTypes ?? CellType.infer(rows: dataRows.map { try! ParsedTable.readRowComponents($0, type: .cassandraSql, delimeter: "|", trim: true) }) + let dataRows = [reader.readLine(), reader.readLine(), reader.readLine(), reader.readLine()].compactMap{$0}.filter { !ParsedTable.technicalRow($0) } + + let types = userTypes ?? 
CellType.infer(rows: dataRows.map { try! ParsedTable.readRowComponents($0, type: .table, delimeter: "│", trim: true) }) let header = (headerOverride ?? parsedHeader).withTypes(types) return (TableConfig(header: header, type: FileType.table, delimeter: "│", trim: true), dataRows) @@ -196,7 +196,7 @@ class ParsedTable: Table { } } - private static func readRowComponents(_ row: String, type: FileType, delimeter: String, trim: Bool) throws -> [String] { + private static func readRowComponents(_ row: String, type: FileType, delimeter: String, trim: Bool) throws -> [String] { if type == .csv { return try! Csv.parseLine(row, delimeter: delimeter) } diff --git a/Sources/table/TableView.swift b/Sources/table/TableView.swift index 23941b7..731c39f 100644 --- a/Sources/table/TableView.swift +++ b/Sources/table/TableView.swift @@ -130,6 +130,105 @@ class DistinctTableView: Table { } } +/** Table view with filtered rows to allow only duplicate values for certain columns */ +class DuplicateTableView: Table { + var table: InMemoryTableView + let duplicateColumns: [String] + let header: Header + + private var entriesCount: Dictionary<[String], Int> = [:] + + init(table: any Table, duplicateColumns: [String]) { + self.table = table.memoized() + self.duplicateColumns = duplicateColumns + self.header = table.header + countEntries() + } + + func next() throws -> Row? { + var row = table.next() + + while let curRow = row { + let values = duplicateColumns.map { col in curRow[col] ?? "" } + + if entriesCount[values] != nil { + return curRow + } + + row = table.next() + } + + return nil + } + + private func countEntries() { + var count = 0 + + while let row = table.next() { + let values = duplicateColumns.map { col in row[col] ?? "" } + entriesCount[values, default: 0] += 1 + count += 1 + } + + // Filter out only duplicates + entriesCount = entriesCount.filter { $0.value > 1 } + + debug("DuplicateTableView: Processed \(count) rows. 
Found \(entriesCount.count) duplicate entries for columns: \(duplicateColumns.joined(separator: ", "))") + + // Reset the cursor to the beginning + table.rewind() + } +} + +class GroupedTableView: Table { + var table: any Table + let groupBy: [String] + let header: Header + private var idx = -1 + + private var groupIterator: Dictionary<[String], [Row]>.Iterator? + + init(table: any Table, groupBy: [String]) { + self.table = table + self.groupBy = groupBy + self.header = table.header + groupIterator = loadGroups() + } + + func next() throws -> Row? { + if let entry = groupIterator!.next() { + let groupKey = entry.key + let group = entry.value + + // Create a new row with the group key as the first columns + let components = header.components().map { name in + if let index = groupBy.firstIndex(of: name) { + return groupKey[index] + } else { + return group.map { $0[name] ?? "" }.joined(separator: ", ") + } + } + + idx += 1 + return Row(header: header, index: idx, components: components) + } else { + return nil + } + } + + private func loadGroups() -> Dictionary<[String], [Row]>.Iterator { + // TODO: make an ordered collection + var groups: [[String]: [Row]] = [:] + + while let row = try? table.next() { + let key = groupBy.map { row[$0] ?? "" } + groups[key, default: []].append(row) + } + + return groups.makeIterator() + } +} + /** Table view that have randomized sample of the rows. */ class SampledTableView: Table { var table: any Table @@ -213,6 +312,8 @@ class InMemoryTableView: InMemoryTable { } func next() -> Row? { + if(!loaded) { try? load() } + if cursor < rows.count { let row = rows[cursor] cursor += 1