From 5dc17d9f439e96667f2ef965ecf05c2c2b753042 Mon Sep 17 00:00:00 2001 From: Till Krenz Date: Wed, 7 Nov 2018 18:06:00 -0500 Subject: [PATCH] Improved matching by address The first part of the address matching test should be: !is.na(name.df$address) (the ! was missing). Also, instead of requiring an exact match in the second part of the test, I suggest using the Jaro-Winkler similarity with a high threshold in order to match up addresses that differ only in minor details; a threshold above 0.9 is probably advisable. --- R/authors_match.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/authors_match.R b/R/authors_match.R index 70d2dca..92d9cc0 100644 --- a/R/authors_match.R +++ b/R/authors_match.R @@ -164,7 +164,7 @@ for (p in unique.groupid) { match2 <- (!is.na(novel.names1$university) & !is.na(name.df$university)) & name.df$university == novel.names1$university # match middle initial match3 <- !is.na(name.df$m.i) & novel.names1$m.i == name.df$m.i - match4 <- is.na(name.df$address) & novel.names1$address == name.df$address + match4 <- !is.na(name.df$address) & RecordLinkage::jarowinkler(name.df$address, novel.names1$address) > 0.9 # match emails # if(nrow(novel.names1)==0){match1<-F;match2<-F;match3<-F} if (sum(ifelse(is.na(c(match1, match2, match3, match4)), FALSE, c(match1, match2, match3, match4))) > 0) {