Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Sources/table/CellType.swift
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ enum CellType {
}
}

debug("Infered cell types: \(CellType.toString(types))")
debug("Inferred cell types: \(CellType.toString(types))")

return types
}
Expand Down
24 changes: 23 additions & 1 deletion Sources/table/Expressions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,8 @@ class Functions {
Random(),
RandomChoice(),
Prefix(),
Array()
Array(),
Distinct()
]

static func find(name: String) -> (any InternalFunction)? {
Expand Down Expand Up @@ -310,4 +311,25 @@ class Functions {
return "array(str) – returns a Cassandra representation of an array with the provided elements. Requires a comma-separated list of arguments or at least a single argument that will be split by commas"
}
}

/// `distinct(str)` – removes repeated elements from a comma-separated list.
///
/// The elements are emitted in the order of their first appearance, so the
/// result is stable between runs (a plain `Set` has no defined ordering).
class Distinct: InternalFunction {
    var name: String { "distinct" }

    /// Accepts exactly one argument.
    /// - Throws: `RuntimeError` when the number of arguments is not one.
    func validate(header: Header?, arguments: [any FormatExpr]) throws {
        if arguments.count != 1 {
            throw RuntimeError("Function \(name) requires exactly one argument")
        }
    }

    /// Evaluates the arguments against `row` and returns the unique elements
    /// joined with commas.
    ///
    /// A single argument is split by commas and every piece is trimmed of
    /// whitespace; several arguments are used as the elements directly.
    /// An empty argument list produces an empty string.
    func apply(row: Row, arguments: [any FormatExpr]) throws -> String {
        let values = try arguments.map { try $0.fill(row: row) }

        let elements: [String]
        if values.count == 1 {
            elements = values[0]
                .split(separator: Character(","))
                .map { String($0).trimmingCharacters(in: .whitespaces) }
        } else {
            elements = values
        }

        // `insert` reports whether the element was new, which lets us drop
        // repeats while keeping the original left-to-right order.
        var seen = Set<String>()
        let unique = elements.filter { seen.insert($0).inserted }

        return unique.joined(separator: ",")
    }

    var description: String {
        return "distinct(str) – returns the distinct elements from a comma separated list of elements. Requires a single argument that will be split by commas"
    }
}
}
4 changes: 2 additions & 2 deletions Sources/table/Extensions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ extension Array {
}

extension Table {
func memoized() -> any Table {
func memoized() -> InMemoryTableView {
if self is InMemoryTableView {
return self
return self as! InMemoryTableView
} else {
return InMemoryTableView(table: self)
}
Expand Down
16 changes: 16 additions & 0 deletions Sources/table/MainApp.swift
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ struct MainApp: AsyncParsableCommand {
@Option(name: .customLong("distinct"), help: "Returns only distinct values for the specified column set. Example: --distinct name,city_id.")
var distinctColumns: [String] = []

// Columns used to detect duplicate rows; the help example uses the same spelling as the flag (--duplicate).
@Option(name: .customLong("duplicate"), help: "Outputs only duplicate rows by the specified columns. Example: --duplicate name,city_id will find duplicates by both name and city_id columns.")
var duplicateColumns: [String] = []

@Option(name: .customLong("group-by"), help: "Groups rows by the specified columns. Example: --group-by city_id,region.")
var groupBy: [String] = []

@Option(name: .customLong("join"), help: "Speficies a second file path to join with the current one. Joining column is the first one for both tables or can be specified by the --on option.")
var joinFile: String?

Expand Down Expand Up @@ -192,6 +198,16 @@ struct MainApp: AsyncParsableCommand {
table = DistinctTableView(table: table, distinctColumns: distinctColumns)
}

// Restrict the output to rows that are duplicated by the requested columns.
if !duplicateColumns.isEmpty {
    // Reject unknown column names before building the view.
    for column in duplicateColumns where table.header.index(ofColumn: column) == nil {
        throw RuntimeError("Column \(column) in duplicate clause is not found in the table")
    }
    table = DuplicateTableView(table: table, duplicateColumns: duplicateColumns)
}

if !groupBy.isEmpty {
try groupBy.forEach { if table.header.index(ofColumn: $0) == nil { throw RuntimeError("Column \($0) in group-by clause is not found in the table") } }
table = GroupedTableView(table: table, groupBy: groupBy)
}

let formatOpt = try printFormat.map { try Format(format: $0).validated(header: table.header) }

if let sortColumns {
Expand Down
12 changes: 6 additions & 6 deletions Sources/table/Table.swift
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,12 @@ class ParsedTable: Table {
if let row = reader.readLine() {
if row.matches(ParsedTable.ownHeaderPattern) {
debug("Detected tool own table format")
let parsedHeader = try reader.readLine().map {
try! Header(data: $0, delimeter: "│", trim: true, hasOuterBorders: true)
}.orThrow(RuntimeError("Failed to parse own table header"))
// Note the use of long pipe │ instead of short one |
let parsedHeader = try reader.readLine().map { try! Header(data: $0, delimeter: "│", trim: true, hasOuterBorders: true) }.orThrow(RuntimeError("Failed to parse own table header"))

let dataRows = [reader.readLine(), reader.readLine(), reader.readLine()].compactMap{$0}.filter { !ParsedTable.technicalRow($0) }
let types = userTypes ?? CellType.infer(rows: dataRows.map { try! ParsedTable.readRowComponents($0, type: .cassandraSql, delimeter: "|", trim: true) })
let dataRows = [reader.readLine(), reader.readLine(), reader.readLine(), reader.readLine()].compactMap{$0}.filter { !ParsedTable.technicalRow($0) }

let types = userTypes ?? CellType.infer(rows: dataRows.map { try! ParsedTable.readRowComponents($0, type: .table, delimeter: "│", trim: true) })
let header = (headerOverride ?? parsedHeader).withTypes(types)

return (TableConfig(header: header, type: FileType.table, delimeter: "│", trim: true), dataRows)
Expand Down Expand Up @@ -196,7 +196,7 @@ class ParsedTable: Table {
}
}

private static func readRowComponents(_ row: String, type: FileType, delimeter: String, trim: Bool) throws -> [String] {
private static func readRowComponents(_ row: String, type: FileType, delimeter: String, trim: Bool) throws -> [String] {
if type == .csv {
return try! Csv.parseLine(row, delimeter: delimeter)
}
Expand Down
101 changes: 101 additions & 0 deletions Sources/table/TableView.swift
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,105 @@ class DistinctTableView: Table {
}
}

/**
 Table view that yields only the rows whose values in `duplicateColumns`
 occur more than once in the underlying table.

 The source table is memoized on construction and scanned once to count
 the occurrences of every column-value combination.
 */
class DuplicateTableView: Table {
    var table: InMemoryTableView
    let duplicateColumns: [String]
    let header: Header

    // Occurrence count per value combination; after the initial scan it holds
    // only the combinations that were seen more than once.
    private var entriesCount: [[String]: Int] = [:]

    init(table: any Table, duplicateColumns: [String]) {
        self.table = table.memoized()
        self.duplicateColumns = duplicateColumns
        self.header = table.header
        countEntries()
    }

    /// Returns the next row that belongs to a duplicated combination, or `nil`
    /// once the memoized table has no more rows.
    func next() throws -> Row? {
        while let candidate = table.next() {
            if entriesCount[key(of: candidate)] != nil {
                return candidate
            }
        }

        return nil
    }

    /// Values of the duplicate columns for `row`; a missing value becomes an empty string.
    private func key(of row: Row) -> [String] {
        return duplicateColumns.map { column in row[column] ?? "" }
    }

    /// Scans the whole table once, keeps the combinations seen more than once
    /// and rewinds the table so that `next()` starts from the first row.
    private func countEntries() {
        var tally: [[String]: Int] = [:]
        var count = 0

        while let row = table.next() {
            tally[key(of: row), default: 0] += 1
            count += 1
        }

        // Filter out only duplicates
        entriesCount = tally.filter { entry in entry.value > 1 }

        debug("DuplicateTableView: Processed \(count) rows. Found \(entriesCount.count) duplicate entries for columns: \(duplicateColumns.joined(separator: ", "))")

        // Reset the cursor to the beginning
        table.rewind()
    }
}

/**
 Table view that collapses the rows of the underlying table into one row per
 distinct combination of the `groupBy` column values.

 Groups are emitted in the order in which their key was first seen. In each
 output row the group-by columns carry the key value and every other column
 carries the values of all rows in the group joined by ", ".

 The underlying table is read completely during initialisation. A read error
 is remembered and thrown from `next()` instead of being silently dropped,
 so a partially read table never produces misleading groups.
 */
class GroupedTableView: Table {
    var table: any Table
    let groupBy: [String]
    let header: Header
    private var idx = -1

    // Groups in first-seen order together with a read position.
    private var groups: [(key: [String], rows: [Row])] = []
    private var cursor = 0
    // Error raised while reading the underlying table, if any.
    private var loadError: Error?

    init(table: any Table, groupBy: [String]) {
        self.table = table
        self.groupBy = groupBy
        self.header = table.header

        // The initialiser is not throwing, so the failure is kept for `next()`.
        do {
            groups = try loadGroups()
        } catch {
            loadError = error
        }
    }

    /// Returns the row for the next group, or `nil` when all groups were returned.
    /// - Throws: the error raised by the underlying table while the groups were loaded.
    func next() throws -> Row? {
        if let loadError {
            throw loadError
        }

        guard cursor < groups.count else {
            return nil
        }

        let entry = groups[cursor]
        cursor += 1

        // Columns keep the header order: group-by columns show the key value,
        // the remaining ones show the joined values of the group's rows.
        let components = header.components().map { name -> String in
            if let index = groupBy.firstIndex(of: name) {
                return entry.key[index]
            } else {
                return entry.rows.map { $0[name] ?? "" }.joined(separator: ", ")
            }
        }

        idx += 1
        return Row(header: header, index: idx, components: components)
    }

    /// Reads every row of the underlying table and buckets it by its group key,
    /// remembering the order in which the keys first appeared.
    private func loadGroups() throws -> [(key: [String], rows: [Row])] {
        var slotByKey: [[String]: Int] = [:]
        var ordered: [(key: [String], rows: [Row])] = []

        while let row = try table.next() {
            let key = groupBy.map { row[$0] ?? "" }

            if let slot = slotByKey[key] {
                ordered[slot].rows.append(row)
            } else {
                slotByKey[key] = ordered.count
                ordered.append((key: key, rows: [row]))
            }
        }

        return ordered
    }
}

/** Table view that have randomized sample of the rows. */
class SampledTableView: Table {
var table: any Table
Expand Down Expand Up @@ -213,6 +312,8 @@ class InMemoryTableView: InMemoryTable {
}

func next() -> Row? {
if(!loaded) { try? load() }

if cursor < rows.count {
let row = rows[cursor]
cursor += 1
Expand Down
Loading