-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcutcols
More file actions
executable file
·100 lines (86 loc) · 2.22 KB
/
cutcols
File metadata and controls
executable file
·100 lines (86 loc) · 2.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env bash
#
# cutcols - extract named columns from a tab-delimited text file.
#
# Usage: cutcols [col1] [col2] [...] [filename]
#
# The last argument is the input file; every argument before it is a column
# name. Names are matched against the first (header) line of the file without
# regard to case, and the selected columns are written in the order requested.
# Output goes to "<root>_fixed.tsv", where <root> is the input path with a
# trailing dot-plus-3-or-4-character extension removed.
#
# Exit status:
#   0  columns extracted
#   1  input file cannot be read
#   2  usage error
#   3  none of the requested columns exist in the header
#   4  input is empty (no header line)
#   Any other failure status from awk is passed through unchanged.

#######################################
# Print the usage text.
# Outputs: usage text on stderr
#######################################
usage() {
  {
    echo "Usage: cutcols [col1] [col2] [...] [filename]"
    echo
    echo "Extracts columns from a delimited text file by case insensitive header names "
    echo
  } >&2
}

#######################################
# Derive the output path for an input path.
# Arguments: $1 - input path
# Outputs:   the output path on stdout
#######################################
output_path() {
  local root=$1
  # Strip a trailing "." followed by four or three characters. The longer
  # suffix is tried first so that a name like "x.y.gz" yields "x", which is
  # what a leftmost regex match of "dot + 3-4 chars at end" produces.
  case "${root}" in
    *.????) root=${root%.????} ;;
    *.???) root=${root%.???} ;;
  esac
  printf '%s_fixed.tsv\n' "${root}"
}

#######################################
# Copy the requested columns of a tab-delimited file to a new file.
# Arguments: $1 - input path, $2 - output path, $3.. - column names
# Outputs:   warnings and errors on stderr; the selected columns in $2
# Returns:   the exit status of awk (3 = no column matched, 4 = empty input)
#######################################
extract_columns() {
  local infile=$1
  local outfile=$2
  shift 2

  local want_cols
  want_cols=$(printf '%s\n' "$@")

  # The names and the output path travel through the environment rather than
  # "awk -v": -v applies backslash-escape processing to the value (mangling
  # names or paths that contain "\") and some awks reject embedded newlines.
  CUTCOLS_WANT="${want_cols}" CUTCOLS_OUT="${outfile}" \
    awk -v FS='\t' -v OFS='\t' '
    BEGIN {
      out_path = ENVIRON["CUTCOLS_OUT"]
      # Walk the list by index: "for (i in arr)" visits elements in an
      # unspecified order and would scramble the requested column order.
      n_list = split(ENVIRON["CUTCOLS_WANT"], names, "\n")
      n_want = 0
      for (i = 1; i <= n_list; i++) {
        if (length(names[i]) == 0) continue
        want[++n_want] = names[i]
      }
    }

    NR == 1 {
      # Index the header by lower-cased name; a repeated name keeps its
      # last occurrence.
      for (i = 1; i <= NF; i++) {
        key = tolower($i)
        col_of[key] = i
        name_of[key] = $i
      }

      # Resolve each requested name, in request order, to a field number.
      n_out = 0
      for (k = 1; k <= n_want; k++) {
        key = tolower(want[k])
        if (key in col_of) {
          sel[++n_out] = col_of[key]
          hdr[n_out] = name_of[key]
        } else {
          printf("Warning: column not found: %s\n", want[k]) > "/dev/stderr"
        }
      }

      if (n_out == 0) {
        print "Error: none of the requested columns were found in header." > "/dev/stderr"
        exit 3
      }

      # Header row, using the spelling found in the file.
      row = hdr[1]
      for (j = 2; j <= n_out; j++) row = row OFS hdr[j]
      print row > out_path
      next
    }

    # Data rows: a row shorter than a selected field number yields "".
    {
      row = (sel[1] <= NF) ? $(sel[1]) : ""
      for (j = 2; j <= n_out; j++) {
        row = row OFS ((sel[j] <= NF) ? $(sel[j]) : "")
      }
      print row > out_path
    }

    END {
      # Without any input line the header rule never ran and nothing was
      # written, so report it instead of finishing silently. An earlier
      # "exit 3" keeps its status because NR is non-zero in that case.
      if (NR == 0) {
        print "Error: input is empty; no header row found." > "/dev/stderr"
        exit 4
      }
    }
  ' "${infile}"
}

#######################################
# Entry point.
# Arguments: column names followed by the input file
# Outputs:   confirmation on stdout; diagnostics on stderr
# Returns:   see "Exit status" at the top of this file
#######################################
main() {
  if (( $# < 2 )) || [[ -z "${!#}" ]]; then
    usage
    return 2
  fi

  # Last arg is filename; others are requested column names.
  local infile="${!#}"
  local -a cols=("${@:1:$#-1}")

  # "-" is left alone so awk can still read standard input.
  if [[ "${infile}" != "-" ]] && [[ ! -r "${infile}" || -d "${infile}" ]]; then
    printf 'Error: cannot read input file: %s\n' "${infile}" >&2
    return 1
  fi

  local outfile
  outfile=$(output_path "${infile}")

  # Only claim success when awk actually succeeded.
  extract_columns "${infile}" "${outfile}" "${cols[@]}" || return

  printf '\nThe desired columns were extracted to %s\n\n' "${outfile}"
}

main "$@"