From 5dc17d9f439e96667f2ef965ecf05c2c2b753042 Mon Sep 17 00:00:00 2001
From: Till Krenz <tilltnet@users.noreply.github.com>
Date: Wed, 7 Nov 2018 18:06:00 -0500
Subject: [PATCH] Improved matching by address

The first part of the address-matching test should be !is.na(name.df$address) (the ! was missing). Also, instead of requiring an exact match in the second part of the test, I suggest using the Jaro-Winkler similarity with a high threshold, so that addresses that differ only in minor details are still matched. This patch uses a threshold of 0.9; an even higher value may be advisable.
---
 R/authors_match.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/authors_match.R b/R/authors_match.R
index 70d2dca..92d9cc0 100644
--- a/R/authors_match.R
+++ b/R/authors_match.R
@@ -164,7 +164,7 @@ for (p in unique.groupid) {
     match2 <- (!is.na(novel.names1$university) & !is.na(name.df$university)) & name.df$university == novel.names1$university
     # match middle initial
     match3 <- !is.na(name.df$m.i) & novel.names1$m.i == name.df$m.i
-    match4 <- is.na(name.df$address) & novel.names1$address == name.df$address
+    match4 <- !is.na(name.df$address) & RecordLinkage::jarowinkler(name.df$address, novel.names1$address) > 0.9
     # match emails
     # if(nrow(novel.names1)==0){match1<-F;match2<-F;match3<-F}
     if (sum(ifelse(is.na(c(match1, match2, match3, match4)), FALSE, c(match1, match2, match3, match4))) > 0) {