This project builds a geospatial risk prediction model for narcotics crimes in Chicago, a crime type that suffers from selection bias. Two new risk factors, environmental complaints and ShotSpotter alerts, are added to the model.
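The chunks below assume a standard setup. Here is a minimal sketch of the packages used; the root.dir path and the textbook's helper script (which provides mapTheme(), plotTheme(), nn_function() and crossValidate()) are assumptions that should point at the book's repository.
## minimal setup sketch: packages used throughout
library(tidyverse) # dplyr, tidyr, ggplot2, stringr
library(sf) # vector spatial data
library(RSocrata) # Chicago open data API
library(viridis) # color scales
library(spdep) # spatial weights, Moran's I
library(spatstat.core) # kernel density estimation
library(raster) # rasterizing the density surface
library(FNN) # k nearest neighbor distances
library(gridExtra) # grid.arrange
library(classInt) # classIntervals / findCols
library(knitr) # kable
library(kableExtra) # kable_styling
## assumptions: root.dir and the book's helpers (mapTheme, plotTheme, nn_function, crossValidate) must be defined
# root.dir <- "<path to the book's data folder>"
# source("<path to the book's functions.r>")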
policeDistricts <-
st_read("https://data.cityofchicago.org/api/geospatial/fthy-xz3r?method=export&format=GeoJSON") %>%
st_transform('ESRI:102271') %>%
dplyr::select(District = dist_num)
policeBeats <-
st_read("https://data.cityofchicago.org/api/geospatial/aerh-rz74?method=export&format=GeoJSON") %>%
st_transform('ESRI:102271') %>%
dplyr::select(District = beat_num)
bothPoliceUnits <- rbind(mutate(policeDistricts, Legend = "Police Districts"),
mutate(policeBeats, Legend = "Police Beats"))
crimes <- # generic name so the same pipeline can be reused for other primary types later
read.socrata("https://data.cityofchicago.org/Public-Safety/Crimes-2020/qzdf-xmn8") %>%
filter(Primary.Type == "NARCOTICS") %>% # keep only narcotics offenses
mutate(x = gsub("[()]", "", Location)) %>%
separate(x,into= c("Y","X"), sep=",") %>%
mutate(X = as.numeric(X),Y = as.numeric(Y)) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant")%>%
st_transform('ESRI:102271') %>%
distinct()
#socrata data download platform
chicagoBoundary <-
st_read(file.path(root.dir,"/Chapter5/chicagoBoundary.geojson")) %>%
st_transform('ESRI:102271')
This is a map of narcotics crimes in Chicago in 2017. It has been argued that racial disparities in drug sanctioning are better explained by the policy decision to dramatically increase arrests of street-level drug offenders, the more public nature of African-American drug offending, and cultural stereotypes that link African-Americans to drug crimes; this argument is revisited in the conclusion.
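The plotting code for that map is not shown above; here is a minimal sketch, assuming the crimes and chicagoBoundary objects created earlier and the book's mapTheme():
ggplot() +
geom_sf(data = chicagoBoundary, fill = "grey90", colour = NA) +
geom_sf(data = crimes, size = 0.3, colour = "black") +
labs(title = "Narcotics crimes, Chicago, 2017") +
mapTheme()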
After creating a fishnet…
## using {sf} to create the grid
## Note the `.[chicagoBoundary] %>% ` line. This is needed to clip the grid to our data
fishnet <-
st_make_grid(chicagoBoundary,
cellsize = 500, # in meters
square = TRUE) %>%
.[chicagoBoundary] %>% # fast way to select intersecting polygons
st_sf() %>%
mutate(uniqueID = 1:n())
## add a value of 1 to each crime, sum them with aggregate
crime_net <-
dplyr::select(crimes) %>%
mutate(countCrimes = 1) %>%
aggregate(., fishnet, sum) %>%
mutate(countCrimes = replace_na(countCrimes, 0),
uniqueID = 1:n(),
cvID = sample(round(nrow(fishnet) / 24), # random fold IDs: roughly 24 cells per fold
size=nrow(fishnet), replace = TRUE))
ggplot() +
geom_sf(data = crime_net, aes(fill = countCrimes), color = NA) +
scale_fill_viridis() +
labs(title = "Count of Crimes for the fishnet") +
mapTheme()
Then download the spatial risk factor features…
## pulling the point data for each risk factor used in the model
## using Socrata again
abandonCars <-
read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Abandoned-Vehicles/3c9v-pnva") %>%
mutate(year = substr(creation_date,1,4)) %>% filter(year == "2017") %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Abandoned_Cars")
abandonBuildings <-
read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Vacant-and-Abandoned-Building/7nii-7srd") %>%
mutate(year = substr(date_service_request_was_received,1,4)) %>% filter(year == "2017") %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Abandoned_Buildings")
shotspotterAlerts <-
read.socrata("https://data.cityofchicago.org/Public-Safety/Violence-Reduction-Shotspotter-Alerts/3h7q-7mdb") %>%
mutate(year = substr(date,1,4)) %>% filter(year == "2017") %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Shotspotter_Alerts")
envcomplaint <-
read.socrata("https://data.cityofchicago.org/Environment-Sustainable-Development/CDPH-Environmental-Complaints/fypr-ksnz") %>%
mutate(year = substr(complaint_date,1,4)) %>% filter(year == "2017") %>%
dplyr::select(Y = latitude, X = longitude) %>%
na.omit() %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform(st_crs(fishnet)) %>%
mutate(Legend = "Environment_Complaints")
## Neighborhoods to use in LOGO-CV in a bit
neighborhoods <-
st_read("https://raw.githubusercontent.com/blackmad/neighborhoods/master/chicago.geojson") %>%
st_transform(st_crs(fishnet))
and aggregate features to our fishnet…
vars_net <-
rbind(abandonCars,abandonBuildings,
envcomplaint, shotspotterAlerts) %>%
st_join(fishnet, join=st_within) %>%
st_drop_geometry() %>%
group_by(uniqueID, Legend) %>%
summarize(count = n()) %>% # count of each risk factor per grid cell
left_join(fishnet, ., by = "uniqueID") %>%
spread(Legend, count, fill=0) %>%
dplyr::select(-`<NA>`) %>%
na.omit() %>%
ungroup() # drop grouping so later operations work on the whole data frame
vars_net.long <-
gather(vars_net, Variable, value, -geometry, -uniqueID)
vars <- unique(vars_net.long$Variable)
mapList <- list()
for(i in vars){
mapList[[i]] <-
ggplot() +
geom_sf(data = filter(vars_net.long, Variable == i), aes(fill=value), colour=NA) +
scale_fill_viridis(name="") +
labs(title=i) +
mapTheme()}
do.call(grid.arrange,c(mapList, ncol=2,nrow=2, top="Risk Factors by Fishnet"))
Add nearest neighbor features…
st_c <- st_coordinates
st_coid <- st_centroid
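nn_function() is assumed to come from the textbook's helper script. A minimal sketch of what it does, returning the mean distance from each origin point to its k nearest target points, built on FNN::get.knnx:
## sketch of the assumed nn_function(): average distance from each origin coordinate to its k nearest target coordinates
nn_function <- function(measureFrom, measureTo, k) {
nn_dists <- FNN::get.knnx(measureTo, measureFrom, k = k)$nn.dist # one row of distances per origin point
rowMeans(nn_dists) # mean of the k nearest distances
}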
## create average nearest neighbor (k = 3) features for each risk factor
vars_net <- vars_net %>%
mutate(
Abandoned_Buildings.fe =
nn_function(st_c(st_coid(vars_net)), st_c(abandonBuildings),k=3),
Abandoned_Cars.fe =
nn_function(st_c(st_coid(vars_net)), st_c(abandonCars),k=3),
shotspotter_alert.fe =
nn_function(st_c(st_coid(vars_net)), st_c(shotspotterAlerts),k=3),
env_complaints.fe =
nn_function(st_c(st_coid(vars_net)), st_c(envcomplaint),k=3))
## Visualize the NN feature
vars_net_textbook <-
dplyr::select(vars_net, Abandoned_Buildings.fe, Abandoned_Cars.fe) %>% # textbook risk factors
gather(Variable, value, -geometry)
vars_net_add <-
dplyr::select(vars_net, shotspotter_alert.fe, env_complaints.fe) %>% # newly added risk factors
gather(Variable, value, -geometry)
ggplot() +
geom_sf(data = vars_net_textbook, aes(fill=value), colour=NA) +
scale_fill_viridis(name="NN Distance") +
labs(title="Abandoned Car NN Distance") +
facet_wrap(~Variable) +
mapTheme()
ggplot() +
geom_sf(data = vars_net_add, aes(fill=value), colour=NA) +
scale_fill_viridis(name="NN Distance") +
labs(title="Abandoned Car NN Distance") +
facet_wrap(~Variable) +
mapTheme()
Join the NN features to our fishnet…
Since the risk factor features were aggregated to each cell by
uniqueID, we can use that ID to join them to the crime
fishnet.
## important to drop the geometry from joining features
final_net <-
left_join(crime_net, st_drop_geometry(vars_net), by="uniqueID")
Join in areal data…
final_net <-
st_centroid(final_net) %>%
st_join(dplyr::select(neighborhoods, name)) %>% # spatial join: tag each cell centroid with its neighborhood name
st_join(dplyr::select(policeDistricts, District)) %>% # ... and with its police district
st_drop_geometry() %>%
left_join(dplyr::select(final_net, geometry, uniqueID)) %>%
st_sf() %>%
na.omit()
# for live demo
mapview::mapview(final_net, zcol = "District")
## generates warnings from PROJ issues
## {spdep} to build a queen-contiguity neighbor list from the fishnet polygons...
final_net.nb <- poly2nb(as_Spatial(final_net), queen=TRUE)
## ... and convert the neighbor list to spatial weights
final_net.weights <- nb2listw(final_net.nb, style="W", zero.policy=TRUE)
#print(final_net.weights, zero.policy=TRUE)
## see ?localmoran
local_morans <- localmoran(final_net$countCrimes, final_net.weights, zero.policy=TRUE) %>%
as.data.frame()
# join local Moran's I results to fishnet
final_net.localMorans <-
cbind(local_morans, as.data.frame(final_net)) %>%
st_sf() %>%
dplyr::select(crime_count = countCrimes,
Local_Morans_I = Ii,
P_Value = `Pr(z != E(Ii))`) %>%
mutate(Significant_Hotspots = ifelse(P_Value <= 0.001, 1, 0)) %>%
gather(Variable, Value, -geometry)
Plotting local Moran's I results…
## This is just for plotting
vars <- unique(final_net.localMorans$Variable)
varList <- list()
for(i in vars){
varList[[i]] <-
ggplot() +
geom_sf(data = filter(final_net.localMorans, Variable == i),
aes(fill = Value), colour=NA) +
scale_fill_viridis(name="") +
labs(title=i) +
mapTheme(title_size = 14) + theme(legend.position="bottom")}
do.call(grid.arrange,c(varList, ncol = 4, top = "Local Moran's I statistics, Crimes"))
Distance to hot spots…
Using NN distance to the nearest significant hot spot cell
# generates warning from NN
final_net <- final_net %>%
mutate(crime.isSig =
ifelse(local_morans[,5] <= 0.001, 1, 0)) %>%
mutate(crime.isSig.dist =
nn_function(st_c(st_coid(final_net)),
st_c(st_coid(filter(final_net,
crime.isSig == 1))),
k = 1))
ggplot() +
geom_sf(data = final_net, aes(fill=crime.isSig.dist), colour=NA) +
scale_fill_viridis(name="significant distance") +
labs(title="distance to highly significant hot pot") +
mapTheme()
Leave-one-group-out cross validation (LOGO-CV) on spatial features… the modeling starts here.
# View(crossValidate)
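crossValidate() is also assumed to come from the textbook's helper script. A simplified sketch of its logic, holding out each group defined by id, fitting a Poisson regression on the remaining groups, and predicting the held-out cells:
## simplified sketch of the assumed crossValidate() helper
crossValidate <- function(dataset, id, dependentVariable, indVariables) {
predictions <- lapply(unique(dataset[[id]]), function(thisFold) {
fold.train <- dataset[dataset[[id]] != thisFold, ] # all other folds
fold.test <- dataset[dataset[[id]] == thisFold, ] # held-out fold
reg <- glm(reformulate(indVariables, dependentVariable),
family = "poisson", data = st_drop_geometry(fold.train))
fold.test$Prediction <- predict(reg, st_drop_geometry(fold.test), type = "response")
fold.test
})
do.call(rbind, predictions) # one sf with an out-of-fold prediction for every cell
}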
## define the variables we want
reg.ss.vars <- c("Abandoned_Cars.fe","Abandoned_Buildings.fe","shotspotter_alert.fe","env_complaints.fe", "crime.isSig.dist")
## RUN REGRESSIONS
reg.ss.spatialCV <- crossValidate(
dataset = final_net,
id = "name",
dependentVariable = "countCrimes",
indVariables = reg.ss.vars) %>%
dplyr::select(cvID = name, countCrimes, Prediction, geometry)
# calculate errors by NEIGHBORHOOD
error_by_reg_and_fold <-
reg.ss.spatialCV %>%
group_by(cvID) %>%
summarize(Mean_Error = mean(Prediction - countCrimes, na.rm = T),
MAE = mean(abs(Mean_Error), na.rm = T),
SD_MAE = mean(abs(Mean_Error), na.rm = T)) %>%
ungroup()
error_by_reg_and_fold %>%
arrange(desc(MAE))
## Simple feature collection with 96 features and 4 fields
## Geometry type: GEOMETRY
## Dimension: XY
## Bounding box: xmin: 341164.6 ymin: 552874.5 xmax: 367164.6 ymax: 594874.5
## Projected CRS: NAD_1983_HARN_StatePlane_Illinois_East_FIPS_1201
## # A tibble: 96 × 5
## cvID Mean_Error MAE SD_MAE geometry
## <chr> <dbl> <dbl> <dbl> <POLYGON [m]>
## 1 Garfield Park -25.7 25.7 25.7 ((350164.6 577874.5, 349664.6 5778…
## 2 Ukrainian Village 12.0 12.0 12.0 ((353164.6 580874.5, 353164.6 5813…
## 3 Humboldt Park -9.87 9.87 9.87 ((349164.6 581874.5, 349164.6 5823…
## 4 Loop -7.04 7.04 7.04 ((357664.6 579374.5, 357664.6 5798…
## 5 Printers Row -6.42 6.42 6.42 ((358664.6 577374.5, 358164.6 5773…
## 6 United Center 5.94 5.94 5.94 ((353164.6 579374.5, 353164.6 5798…
## 7 Mckinley Park 5.49 5.49 5.49 ((353664.6 573874.5, 354164.6 5738…
## 8 North Lawndale -5.07 5.07 5.07 ((349164.6 576874.5, 349164.6 5773…
## 9 Lower West Side 4.58 4.58 4.58 ((353664.6 574874.5, 353664.6 5753…
## 10 Wicker Park 4.46 4.46 4.46 ((353164.6 582374.5, 353164.6 5828…
## # … with 86 more rows
error_by_reg_and_fold %>%
arrange(MAE)
## Simple feature collection with 96 features and 4 fields
## Geometry type: GEOMETRY
## Dimension: XY
## Bounding box: xmin: 341164.6 ymin: 552874.5 xmax: 367164.6 ymax: 594874.5
## Projected CRS: NAD_1983_HARN_StatePlane_Illinois_East_FIPS_1201
## # A tibble: 96 × 5
## cvID Mean_Error MAE SD_MAE geometry
## <chr> <dbl> <dbl> <dbl> <GEOMETRY [m]>
## 1 West Loop -0.00187 0.00187 0.00187 POLYGON ((355164.6 579874.5…
## 2 Norwood Park -0.00448 0.00448 0.00448 MULTIPOLYGON (((342164.6 58…
## 3 Jefferson Park -0.0140 0.0140 0.0140 POLYGON ((345164.6 589874.5…
## 4 North Park 0.0224 0.0224 0.0224 POLYGON ((350164.6 589374.5…
## 5 Hegewisch 0.0309 0.0309 0.0309 POLYGON ((364164.6 552874.5…
## 6 Avalon Park -0.0319 0.0319 0.0319 POLYGON ((361164.6 563874.5…
## 7 Museum Campus 0.0460 0.0460 0.0460 MULTIPOLYGON (((359664.6 57…
## 8 South Deering 0.0606 0.0606 0.0606 POLYGON ((362164.6 554874.5…
## 9 Sauganash,Forest Glen 0.0808 0.0808 0.0808 POLYGON ((346664.6 591874.5…
## 10 Clearing 0.0868 0.0868 0.0868 POLYGON ((344164.6 568374.5…
## # … with 86 more rows
## plot histogram of OOF (out of fold) errors
error_by_reg_and_fold %>%
ggplot(aes(MAE)) +
geom_histogram(bins = 30, colour="black", fill = "#FDE725FF") +
scale_x_continuous(breaks = seq(0, 11, by = 1)) +
labs(title="Distribution of MAE", subtitle = "LOGO-CV",
x="Mean Absolute Error", y="Count")
# demo of kernel bandwidth; sigma is in map units (meters for this CRS)
crime_ppp <- as.ppp(st_coordinates(crimes), W = st_bbox(final_net))
crime_KD.1000 <- spatstat.core::density.ppp(crime_ppp, 1000)
crime_KD.1500 <- spatstat.core::density.ppp(crime_ppp, 1500)
crime_KD.2000 <- spatstat.core::density.ppp(crime_ppp, 2000)
crime_KD.df <- rbind(
mutate(data.frame(rasterToPoints(mask(raster(crime_KD.1000), as(neighborhoods, 'Spatial')))), Legend = "1000 m"),
mutate(data.frame(rasterToPoints(mask(raster(crime_KD.1500), as(neighborhoods, 'Spatial')))), Legend = "1500 m"),
mutate(data.frame(rasterToPoints(mask(raster(crime_KD.2000), as(neighborhoods, 'Spatial')))), Legend = "2000 m"))
crime_KD.df$Legend <- factor(crime_KD.df$Legend, levels = c("1000 m", "1500 m", "2000 m"))
ggplot(data=crime_KD.df, aes(x=x, y=y)) +
geom_raster(aes(fill=layer)) +
facet_wrap(~Legend) +
coord_sf(crs=st_crs(final_net)) +
scale_fill_viridis(name="Density") +
labs(title = "Kernel density with 3 different search radii") +
mapTheme(title_size = 14)
as.data.frame(crime_KD.1000) %>%
st_as_sf(coords = c("x", "y"), crs = st_crs(final_net)) %>%
aggregate(., final_net, mean) %>%
ggplot() +
geom_sf(aes(fill=value)) +
geom_sf(data = sample_n(crimes, 1500), size = .5) +
scale_fill_viridis(name = "Density") +
labs(title = "Kernel density of 2017 crimes") +
mapTheme(title_size = 14)
correlation.long <-
st_drop_geometry(final_net) %>%
dplyr::select(-uniqueID, -cvID, -name, -District) %>%
gather(Variable, Value, -countCrimes)
correlation.cor <-
correlation.long %>%
group_by(Variable) %>%
summarize(correlation = cor(Value, countCrimes, use = "complete.obs"))
ggplot(correlation.long, aes(Value, countCrimes)) +
geom_point(size = 0.1) +
geom_text(data = correlation.cor, aes(label = paste("r =", round(correlation, 2))),
x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) +
geom_smooth(method = "lm", se = FALSE, colour = "black") +
facet_wrap(~Variable, ncol = 2, scales = "free") +
labs(title = "Crime count as a function of risk factors") +
plotTheme()
reg.vars <- c("Abandoned_Buildings.fe", "Abandoned_Cars.fe","shotspotter_alert.fe",
"env_complaints.fe","crime.isSig", "crime.isSig.dist")
reg.ss.vars <- c("Abandoned_Buildings.fe", "Abandoned_Cars.fe","shotspotter_alert.fe",
"env_complaints.fe","crime.isSig", "crime.isSig.dist")
reg.cv <- crossValidate(
dataset = final_net,
id = "cvID",
dependentVariable = "countCrimes",
indVariables = reg.vars) %>%
dplyr::select(cvID = cvID, countCrimes, Prediction, geometry)
reg.ss.cv <- crossValidate(
dataset = final_net,
id = "cvID",
dependentVariable = "countCrimes",
indVariables = reg.ss.vars) %>%
dplyr::select(cvID = cvID, countCrimes, Prediction, geometry)
reg.spatialCV <- crossValidate(
dataset = final_net,
id = "name",
dependentVariable = "countCrimes",
indVariables = reg.vars) %>%
dplyr::select(cvID = name, countCrimes, Prediction, geometry)
reg.ss.spatialCV <- crossValidate(
dataset = final_net,
id = "name",
dependentVariable = "countCrimes",
indVariables = reg.ss.vars) %>%
dplyr::select(cvID = name, countCrimes, Prediction, geometry)
reg.summary <-
rbind(
mutate(reg.cv, Error = Prediction - countCrimes,
Regression = "Random k-fold CV: Just Risk Factors"),
mutate(reg.ss.cv, Error = Prediction - countCrimes,
Regression = "Random k-fold CV: Spatial Process"),
mutate(reg.spatialCV, Error = Prediction - countCrimes,
Regression = "Spatial LOGO-CV: Just Risk Factors"),
mutate(reg.ss.spatialCV, Error = Prediction - countCrimes,
Regression = "Spatial LOGO-CV: Spatial Process")) %>%
st_sf()
error_by_reg_and_fold <-
reg.summary %>%
group_by(Regression, cvID) %>%
summarize(Mean_Error = mean(Prediction - countCrimes, na.rm = T),
MAE = mean(abs(Mean_Error), na.rm = T),
SD_MAE = mean(abs(Mean_Error), na.rm = T)) %>%
ungroup()
error_by_reg_and_fold %>%
ggplot(aes(MAE)) +
geom_histogram(bins = 30, colour="black", fill = "#FDE725FF") +
facet_wrap(~Regression) +
geom_vline(xintercept = 0) + scale_x_continuous(breaks = seq(0, 8, by = 1)) +
labs(title="Distribution of MAE", subtitle = "k-fold cross validation vs. LOGO-CV",
x="Mean Absolute Error", y="Count") +
plotTheme()
## A table of raw errors for the random k-fold vs. spatial cross-validation regressions.
st_drop_geometry(error_by_reg_and_fold) %>%
group_by(Regression) %>%
summarize(Mean_MAE = round(mean(MAE), 2),
SD_MAE = round(sd(MAE), 2)) %>%
kable() %>%
kable_styling("striped", full_width = F) %>%
row_spec(2, color = "black", background = "#FDE725FF") %>%
row_spec(4, color = "black", background = "#FDE725FF")
| Regression | Mean_MAE | SD_MAE |
|---|---|---|
| Random k-fold CV: Just Risk Factors | 1.35 | 1.74 |
| Random k-fold CV: Spatial Process | 1.35 | 1.74 |
| Spatial LOGO-CV: Just Risk Factors | 1.23 | 1.33 |
| Spatial LOGO-CV: Spatial Process | 1.23 | 1.33 |
error_by_reg_and_fold %>%
filter(str_detect(Regression, "LOGO")) %>%
ggplot() +
geom_sf(aes(fill = MAE)) +
facet_wrap(~Regression) +
scale_fill_viridis() +
labs(title = "Crime errors by LOGO-CV Regression") +
mapTheme() + theme(legend.position="bottom")
neighborhood.weights <-
filter(error_by_reg_and_fold, Regression == "Spatial LOGO-CV: Spatial Process") %>%
group_by(cvID) %>%
poly2nb(as_Spatial(.), queen=TRUE) %>%
nb2listw(., style="W", zero.policy=TRUE)
filter(error_by_reg_and_fold, str_detect(Regression, "LOGO")) %>%
st_drop_geometry() %>%
group_by(Regression) %>%
summarize(Morans_I = moran.mc(abs(Mean_Error), neighborhood.weights,
nsim = 999, zero.policy = TRUE,
na.action=na.omit)[[1]],
p_value = moran.mc(abs(Mean_Error), neighborhood.weights,
nsim = 999, zero.policy = TRUE,
na.action=na.omit)[[3]])
## # A tibble: 2 × 3
## Regression Morans_I p_value
## <chr> <dbl> <dbl>
## 1 Spatial LOGO-CV: Just Risk Factors 0.184 0.006
## 2 Spatial LOGO-CV: Spatial Process 0.184 0.005
Get 2018 crime data…
crimes18 <-
read.socrata("https://data.cityofchicago.org/Public-Safety/Crimes-2018/3i3m-jwuy") %>%
filter(Primary.Type == "NARCOTICS") %>%
mutate(x = gsub("[()]", "", Location)) %>%
separate(x,into= c("Y","X"), sep=",") %>%
mutate(X = as.numeric(X),
Y = as.numeric(Y)) %>%
na.omit %>%
st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
st_transform('ESRI:102271') %>%
distinct() %>%
.[fishnet,]
crime_KDE_sum <- as.data.frame(crime_KD.1000) %>%
st_as_sf(coords = c("x", "y"), crs = st_crs(final_net)) %>%
aggregate(., final_net, mean)
kde_breaks <- classIntervals(crime_KDE_sum$value,
n = 5, "fisher")
crime_KDE_sf <- crime_KDE_sum %>%
mutate(label = "Kernel Density",
Risk_Category = classInt::findCols(kde_breaks),
Risk_Category = case_when(
Risk_Category == 5 ~ "5th",
Risk_Category == 4 ~ "4th",
Risk_Category == 3 ~ "3rd",
Risk_Category == 2 ~ "2nd",
Risk_Category == 1 ~ "1st")) %>%
cbind(
aggregate(
dplyr::select(crimes18) %>% mutate(crimeCount = 1), ., sum) %>%
mutate(crimeCount = replace_na(crimeCount, 0))) %>%
dplyr::select(label, Risk_Category, crimeCount)
ml_breaks <- classIntervals(reg.ss.spatialCV$Prediction,
n = 5, "fisher")
crime_risk_sf <-
reg.ss.spatialCV %>%
mutate(label = "Risk Predictions",
Risk_Category =classInt::findCols(ml_breaks),
Risk_Category = case_when(
Risk_Category == 5 ~ "5th",
Risk_Category == 4 ~ "4th",
Risk_Category == 3 ~ "3rd",
Risk_Category == 2 ~ "2nd",
Risk_Category == 1 ~ "1st")) %>%
cbind(
aggregate(
dplyr::select(crimes18) %>% mutate(crimeCount = 1), ., sum) %>%
mutate(crimeCount = replace_na(crimeCount, 0))) %>%
dplyr::select(label,Risk_Category, crimeCount)
rbind(crime_KDE_sf, crime_risk_sf) %>%
na.omit() %>%
gather(Variable, Value, -label, -Risk_Category, -geometry) %>%
ggplot() +
geom_sf(aes(fill = Risk_Category), colour = NA) +
geom_sf(data = sample_n(crimes18, 3000), size = .5, colour = "black") +
facet_wrap(~label) +
scale_fill_viridis(discrete = TRUE) +
labs(title="Comparison of Kernel Density and Risk Predictions",
subtitle="2017 crime risk predictions; 2018 crimes") +
mapTheme(title_size = 14)
rbind(crime_KDE_sf, crime_risk_sf) %>%
st_drop_geometry() %>%
na.omit() %>%
gather(Variable, Value, -label, -Risk_Category) %>%
group_by(label, Risk_Category) %>%
summarize(countCrimes = sum(Value)) %>%
ungroup() %>%
group_by(label) %>%
mutate(Pcnt_of_test_set_crimes = countCrimes / sum(countCrimes)) %>%
ggplot(aes(Risk_Category,Pcnt_of_test_set_crimes)) +
geom_bar(aes(fill=label), position="dodge", stat="identity") +
scale_fill_viridis(discrete = TRUE, name = "Model") +
labs(title = "Risk prediction vs. Kernel density, 2018 crimes",
y = "% of Test Set crimes (per model)",
x = "Risk Category") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
The existing empirical evidence suggests that the racial disparities
exacerbated by the war on drugs are more likely due to political
expediency and racialized politics. It is argued that these racial
disparities in drug sanctioning are better explained by the policy
decision to dramatically increase the number of arrests of street-level
drug offenders, the more public nature of African-American drug
offending, and cultural stereotypes that link African-Americans to drug
crimes.
A geospatial risk prediction model borrows the narcotics
experience in places where it has been observed and tests whether that
experience generalizes to places where risk may be high, despite few
actual events. It is very likely that this model suffers from some level of
selection bias. If law enforcement systematically over-polices certain
communities, and that selection process goes unaccounted for in the
model, then the model may be biased regardless of the tests above.
I would not recommend this algorithm be put into production.
The textbook raises these questions: "What if the $10 million in savings
lead police to increase enforcement and surveillance disproportionately
in Black and Brown communities? Worse, what about feedback effects where
steering police to these neighborhoods cause more reported crime, which
then leads to increased predicted risk?”
I agree that the right
approach is a function of community standards. Machine learning could be
the next logical progression in analytics, but it also needs to avoid
being a tool of the surveillance state.