-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSQL cleaning data.sql
More file actions
128 lines (87 loc) · 4.07 KB
/
SQL cleaning data.sql
File metadata and controls
128 lines (87 loc) · 4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
/*
Data Cleaning Using Microsoft SQL Server Management Studio
*/
---------------------------------------------------------------------------------------------------------------------------------
-- Select all columns to view the data and identify where to clean
SELECT
*
FROM
housingdata;
---------------------------------------------------------------------------------------------------------------------------------
-- The SaleDate column is in MMDDYYY format; The standard format for SQL is YYYYMMDD
-- Modify the column by using the 'Convert' function
UPDATE housingdata
set SaleDate=CONVERT(date,SaleDate);
---------------------------------------------------------------------------------------------------------------------------------
-- The PropertyAddress column has null values; there are empty fields in that column
-- Populate the missing fields by comparing the ParcelID and the UniqueID of the PropertyAddress
UPDATE a
-- Using the alias 'a' to avoid any errors
SET PropertyAddress=ISNULL(a.PropertyAddress, b.PropertyAddress)
FROM housingdata a
JOIN housingdata b
ON a.ParcelID = b.ParcelID
AND a.[UniqueID ] <> b.[UniqueID ]
WHERE a.PropertyAddress IS NULL;
---------------------------------------------------------------------------------------------------------------------------------
-- Split the PropertyAddress column into different columns, i.e, address, city and state
-- Create new columns to house the data split from the ProperyAddress column
ALTER TABLE housingdata
ADD Property_Address NVARCHAR(255);
ALTER TABLE housingdata
ADD Property_City NVARCHAR(255);
-- Split the data into the new columns using ',' as the delimiter to split data in PropertyAddress columns
UPDATE housingdata
SET Property_Address=
SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1);
UPDATE housingdata
SET Property_City=
SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) +1, LEN(PropertyAddress));
---------------------------------------------------------------------------------------------------------------------------------
-- The OwnerAddress column is similar to the PropertyAddress, Split the OwnerAddress column into address, city and state
-- Create three new columns to house the split data
ALTER TABLE housingdata
ADD Owner_Address NVARCHAR(255);
ALTER TABLE housingdata
ADD Owner_City NVARCHAR(255);
ALTER TABLE housingdata
ADD Owner_State NVARCHAR(255);
-- Split the data into the newly created column
UPDATE housingdata
SET Owner_Address=PARSENAME(REPLACE(OwnerAddress, ',', '.') , 3);
UPDATE housingdata
SET Owner_City=PARSENAME(REPLACE(OwnerAddress, ',', '.') , 2);
UPDATE housingdata
SET Owner_State=PARSENAME(REPLACE(OwnerAddress, ',', '.') , 1);
---------------------------------------------------------------------------------------------------------------------------------
-- The SoldAsVacant Column has four different entries; Yes, No, Y and N.
-- For every Y replace with Yes, and for every N replace with No
UPDATE housingdata
SET SoldAsVacant=CASE
WHEN SoldAsVacant = 'Y' THEN 'YES'
WHEN SoldAsVacant = 'N' THEN 'NO'
ELSE SoldAsVacant
END
---------------------------------------------------------------------------------------------------------------------------------
-- Remove Duplicates using CTE
WITH temphousingdata AS
(SELECT *, ROW_NUMBER() OVER
(PARTITION BY ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference
ORDER BY
UniqueID
) row_num
FROM housingdata
)
-- The temp table allows us to see the duplicates in the dataset
DELETE
FROM temphousingdata
WHERE row_num > 1;
---------------------------------------------------------------------------------------------------------------------------------
-- Delete unused columns in the dataset
ALTER TABLE housingdata
DROP COLUMN OwnerAddress, PropertyAddress, TaxDistrict;
---------------------------------------------------------------------------------------------------------------------------------