-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathaddresses-fix
More file actions
executable file
·88 lines (73 loc) · 2.18 KB
/
addresses-fix
File metadata and controls
executable file
·88 lines (73 loc) · 2.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/awk -f
# Configuration shared by all rules of this script.
#
# NOTE: the script calls gensub(), which is a gawk extension, so the "awk"
# named in the shebang line has to be gawk.
BEGIN {
    # Input and output are both tab-separated.
    FS = OFS = "\t"

    # Header names of the columns that hold free-form addresses.
    addr1Label = "ADDRESS1"
    addr2Label = "ADDRESS2"

    # Every record (header included) is written to this file.
    outfile = "outfile_fixed.tsv"

    # Postal-code patterns; capture group 1 is the code itself, which must sit
    # at the very end of the address segment being tested.
    zipregex1 = ".*([0-9]{5}(-[0-9]{4})?$)"             # US zip codes (ZIP or ZIP+4)
    zipregex2 = ".*([A-Z][0-9][A-Z] [0-9][A-Z][0-9]$)"  # Canadian codes

    # Two capital letters at the start of a segment (after optional blanks),
    # followed by a blank or by the end of the segment; group 1 is the code.
    # The terminator is written as ( |$) on purpose: inside a bracket
    # expression such as [ $] the "$" is a literal dollar sign, not an anchor,
    # so a segment consisting only of a state code would never match.
    stateregex = "^ *([A-Z][A-Z])( |$).*$"              # US states
}
# fixaddr(address)
#
# Break a free-form postal address into five parts and return them as one
# tab-separated string:
#
#     line1 <TAB> line2 <TAB> city <TAB> state <TAB> zip
#
# address is split into segments on "," (a "$" is accepted as an alternative
# separator).  The segments are then classified with the global patterns
# zipregex1, zipregex2 and stateregex:
#   * zip    - taken from a segment ending in a US or Canadian postal code;
#   * state  - taken from a segment matched by stateregex, else the words that
#              precede the zip inside the same segment, else the whole segment
#              just before the zip segment;
#   * city   - the segment just before the state segment;
#   * line1  - always the first segment;
#   * line2  - every segment between the first one and the city, joined
#              with ", ".
# Parts that cannot be determined are returned empty.  Segments are not
# trimmed, so a part copied from a whole segment keeps any blank that
# followed the separator.
#
# Only the first parameter is meant to be passed by callers; the names after
# the gap are awk's way of declaring local variables, which keeps this
# function from overwriting globals.
function fixaddr(address,    addr1, addr2, city, state, zip, zipindex, stateindex, cityindex, arrlen, addrarr, addfields, field, statezip, k)
{
    addr1 = addr2 = city = state = zip = ""
    zipindex = stateindex = cityindex = 0

    # Normalise the alternative "$" separator, then split into segments.
    gsub(/\$/, ",", address)
    arrlen = split(address, addrarr, ",")
    if (arrlen > 0) { addr1 = addrarr[1] }

    for (addfields = 1; addfields <= arrlen; addfields++) {
        field = addrarr[addfields]

        # A later US zip replaces an earlier one (last match wins).
        if (field ~ zipregex1) {
            zip = gensub(zipregex1, "\\1", 1, field)
            zipindex = addfields
        }
        # Canadian codes are only considered while no zip has been found.
        if (zip == "" && field ~ zipregex2) {
            zip = gensub(zipregex2, "\\1", 1, field)
            zipindex = addfields
        }
        if (field ~ stateregex) {
            state = gensub(stateregex, "\\1", 1, field)
            stateindex = addfields
        }
        # No state code seen yet: accept the word(s) written in front of the
        # zip in this segment, e.g. " New York 12207".  gensub() returns its
        # input unchanged when the pattern does not match, so the assignment
        # is guarded to avoid copying the whole segment (zip included) into
        # state.
        if (state == "" && zip != "" && field ~ /^ [A-Za-z]+ /) {
            statezip = "^ *([A-Za-z]+( [A-Za-z]+)*) +" zip "$"
            if (field ~ statezip) {
                state = gensub(statezip, "\\1", 1, field)
                stateindex = addfields
            }
        }
    }

    # Still no state: assume it is the segment right before the zip segment.
    if (stateindex == 0 && zipindex > 1) {
        state = addrarr[zipindex - 1]
        stateindex = zipindex - 1
    }
    if (stateindex > 1) {
        city = addrarr[stateindex - 1]
        cityindex = stateindex - 1
    }

    # Everything between the first segment and the city is the second address
    # line, however many segments that is.
    for (k = 2; k < cityindex; k++) {
        addr2 = (k == 2) ? addrarr[k] : (addr2 ", " addrarr[k])
    }

    return addr1 "\t" addr2 "\t" city "\t" state "\t" zip
}
# Header record: find the address columns by their header names, then replace
# each of those header cells with five tab-separated sub-column names.
NR == 1 {
    for (col = 1; col <= NF; col++) {
        if ($col == addr1Label) { address1 = col }
        if ($col == addr2Label) { address2 = col }
    }
    if (address1 > 0) {
        $address1 = "addr1_line1\taddr1_line2\taddr1_city\taddr1_state\taddr1_zip"
    }
    if (address2 > 0) {
        $address2 = "addr2_line1\taddr2_line2\taddr2_city\taddr2_state\taddr2_zip"
    }
}

# Data records: replace each address cell that was located in the header with
# the five tab-separated parts returned by fixaddr().
NR != 1 {
    if (address1 > 0) { $address1 = fixaddr($address1) }
    if (address2 > 0) { $address2 = fixaddr($address2) }
}

# Every record: write it to the output file and, once per 1000 records, show
# a progress counter that stays on one line thanks to the carriage return.
{
    print $0 > outfile
    if (NR % 1000 == 0) {
        printf("%d records processed\r", NR)
    }
}
# After the last record: report how many records were read and which file
# they were written to.
END {
    summary = NR " records processed and sent to " outfile
    print summary
}