UMKC Center for Health Insights
Earl F Glynn
2013-12-12. Last modified 2015-01-04.
https://data.kcmo.org/311/KCMOPS311-Data/7at3-sxhp
2015-01-03
The current file has 819903 rows by 27 columns.
library(stringr)
# Pull latitude and longitude out of the ADDRESS.WITH.GEOCODE text.
# Each value is split on "(", "," and ")"; the 3rd piece is read as latitude
# and the 4th as longitude (assumes the geocode is the only parenthesised
# part and that exactly one comma precedes it -- TODO confirm against the data).
# The delimiters are given as a bracket expression so the pattern contains no
# unbalanced ")" and is accepted by every regex engine R offers.
x <- d$ADDRESS.WITH.GEOCODE
splits <- strsplit(x, "[(),]")
# "[" returns NA when a piece is absent, so every row yields exactly one
# string; vapply enforces that, and as.numeric() turns NA pieces into NA.
d$latitude <- as.numeric(str_trim(vapply(splits, "[", character(1), 3)))
d$longitude <- as.numeric(str_trim(vapply(splits, "[", character(1), 4)))
Let’s look at raw plots to discover problems in the data.
Because of the large number of points, let’s create PNG graphics and display them.
# Send the raw diagnostic plots to PNG files; the "%d" in the file name is
# replaced by the page number, so each plot lands in its own file.
png("figures/LatLongPlotsRaw%d.png")
# XCoordinate against longitude: full range, then zoomed on the x axis.
smoothScatter(d$longitude, d$XCoordinate, main="XCoordinate vs Longitude (raw)")
## Warning in KernSmooth::bkde2D(x, bandwidth = bandwidth, gridsize = nbin, : Binning grid too coarse for current (small)
## bandwidth: consider increasing 'gridsize'
smoothScatter(d$longitude, d$XCoordinate, main="XCoordinate vs Longitude (raw)", xlim=c(-94.8,-94.3))
grid()
# YCoordinate against latitude: full range, then two zoom levels.
smoothScatter(d$latitude, d$YCoordinate, main="YCoordinate vs Latitude (raw)")
## Warning in KernSmooth::bkde2D(x, bandwidth = bandwidth, gridsize = nbin, : Binning grid too coarse for current (small)
## bandwidth: consider increasing 'gridsize'
smoothScatter(d$latitude, d$YCoordinate, main="YCoordinate vs Latitude (raw)", xlim=c(38.4, 39.4))
# The x axis is latitude, so the zoom limits must be a latitude range;
# longitude-style limits (around -94) would leave this panel empty.
smoothScatter(d$latitude, d$YCoordinate, main="YCoordinate vs Latitude (raw)", xlim=c(38.8, 39.4))
grid()
dev.off()
## pdf
## 2
A number of XCoordinate and YCoordinate values are zero:
# Flag rows where YCoordinate or XCoordinate equals zero, then count them.
zero.coord <- (d$YCoordinate == 0) | (d$XCoordinate == 0)
sum(zero.coord)
## [1] 40621
Latitude and longitude are sometimes missing:
# Flag rows where the parsed latitude or longitude is NA, then count them.
missing.lat.long <- !complete.cases(d$latitude, d$longitude)
sum(missing.lat.long)
## [1] 27632
Some Latitude and Longitude values need cleanup based on the raw plots above.
# Latitudes below 38.8 or above 39.4 are flagged. Comparing NA yields NA, so
# the first count ignores NA and the flag is then recoded NA -> FALSE.
problem.lat <- (d$latitude < 38.8) | (d$latitude > 39.4)
sum(problem.lat, na.rm=TRUE)
## [1] 18296
problem.lat <- replace(problem.lat, is.na(problem.lat), FALSE)
sum(problem.lat)
## [1] 18296
# Longitudes greater than -94.0 are flagged, with the same NA handling.
problem.long <- d$longitude > -94.0
sum(problem.long, na.rm=TRUE)
## [1] 2
problem.long <- replace(problem.long, is.na(problem.long), FALSE)
sum(problem.long)
## [1] 2
For now, reject all problem records and plot charts with only “good” data.
# A record is rejected when any one of the four problem flags is set.
problems <- Reduce(`|`, list(zero.coord, missing.lat.long, problem.lat, problem.long))
sum(problems)
## [1] 68255
# Open a PNG device for the plots of the non-problem rows; "%d" in the file
# name is replaced by the page number.
png("figures/LatLongPlots%d.png")
# Density scatter of YCoordinate against latitude, restricted to rows not
# flagged in `problems`.
smoothScatter(d$latitude[!problems], d$YCoordinate[!problems], main="YCoordinate vs Latitude")
grid()
# Least-squares line of YCoordinate on latitude over the same rows.
# NOTE(review): the object is called fit.X.lat although the response is
# YCoordinate -- confirm nothing else relies on the name before renaming.
fit.X.lat <- lm(d$YCoordinate[!problems] ~ d$latitude[!problems])
# Overlay the fitted line on the scatter plot, then print the fit summary.
abline(fit.X.lat, col="red", lwd=3, lty="dotted")
summary(fit.X.lat)
##
## Call:
## lm(formula = d$YCoordinate[!problems] ~ d$latitude[!problems])
##
## Residuals:
## Min 1Q Median 3Q Max
## -130849 -633 -325 -8 158517
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.296e+07 3.111e+03 -4166 <2e-16 ***
## d$latitude[!problems] 3.588e+05 7.963e+01 4506 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6581 on 751646 degrees of freedom
## Multiple R-squared: 0.9643, Adjusted R-squared: 0.9643
## F-statistic: 2.03e+07 on 1 and 751646 DF, p-value: < 2.2e-16
# Density scatter of XCoordinate against longitude, restricted to rows not
# flagged in `problems`.
smoothScatter(d$longitude[!problems], d$XCoordinate[!problems], main="XCoordinate vs Longitude")
grid()
# Least-squares line of XCoordinate on longitude over the same rows.
fit.X.long <- lm(d$XCoordinate[!problems] ~ d$longitude[!problems])
# Overlay the fitted line on the scatter plot, then print the fit summary.
abline(fit.X.long, col="red", lwd=3, lty="dotted")
summary(fit.X.long)
##
## Call:
## lm(formula = d$XCoordinate[!problems] ~ d$longitude[!problems])
##
## Residuals:
## Min 1Q Median 3Q Max
## -66018 -174 -45 74 77996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.930e+07 3.943e+03 7431 <2e-16 ***
## d$longitude[!problems] 2.806e+05 4.171e+01 6727 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1623 on 751646 degrees of freedom
## Multiple R-squared: 0.9837, Adjusted R-squared: 0.9837
## F-statistic: 4.525e+07 on 1 and 751646 DF, p-value: < 2.2e-16
# Close the current graphics device so the PNG output is written to disk.
dev.off()
## pdf
## 2
Regression lines on the charts above suggest linear relationships between the variable pairs.