This project builds a geospatial risk prediction model for narcotics crimes in Chicago, an outcome known to suffer from selection bias. Two new risk factors, environmental complaints and ShotSpotter alerts, are added to the model.
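
The code below assumes the spatial analysis stack from the Public Policy Analytics textbook, including its helper functions mapTheme(), plotTheme(), nn_function(), and crossValidate(). A minimal setup sketch follows; the package list is inferred from the calls used in this section.

## minimal setup sketch -- packages inferred from the calls used below
library(tidyverse)     # dplyr, tidyr, ggplot2, stringr
library(sf)            # vector spatial data
library(RSocrata)      # read.socrata()
library(spdep)         # poly2nb(), nb2listw(), localmoran(), moran.mc()
library(FNN)           # get.knnx(), used by nn_function()
library(gridExtra)     # grid.arrange()
library(viridis)       # scale_fill_viridis()
library(raster)        # raster(), rasterToPoints(), mask()
library(spatstat.core) # density.ppp() kernel density
library(classInt)      # classIntervals(), findCols()
library(knitr)         # kable()
library(kableExtra)    # kable_styling(), row_spec()
## mapTheme(), plotTheme(), nn_function(), and crossValidate() are assumed
## to come from the book's functions.r helper script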

policeDistricts <- 
  st_read("https://data.cityofchicago.org/api/geospatial/fthy-xz3r?method=export&format=GeoJSON") %>%
  st_transform('ESRI:102271') %>%
  dplyr::select(District = dist_num)
  
policeBeats <- 
  st_read("https://data.cityofchicago.org/api/geospatial/aerh-rz74?method=export&format=GeoJSON") %>%
  st_transform('ESRI:102271') %>%
  dplyr::select(District = beat_num)

bothPoliceUnits <- rbind(mutate(policeDistricts, Legend = "Police Districts"), 
                         mutate(policeBeats, Legend = "Police Beats"))

crimes <-  # generic name, so the pipeline can be reused for other primary types later
  read.socrata("https://data.cityofchicago.org/Public-Safety/Crimes-2020/qzdf-xmn8") %>%  # reported crimes via Socrata
    filter(Primary.Type == "NARCOTICS") %>%
    mutate(x = gsub("[()]", "", Location)) %>%
    separate(x,into= c("Y","X"), sep=",") %>%
    mutate(X = as.numeric(X),Y = as.numeric(Y)) %>% 
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant")%>%
    st_transform('ESRI:102271') %>% 
    distinct()
## Socrata is the City of Chicago's open data platform
chicagoBoundary <- 
  st_read(file.path(root.dir,"/Chapter5/chicagoBoundary.geojson")) %>%
  st_transform('ESRI:102271') 

A map of the outcome of interest

This is a map of narcotics crimes in Chicago in 2017.
It has been argued that racial disparities in drug sanctioning are better explained by the policy decision to dramatically increase arrests of street-level drug offenders, the more public nature of African-American drug offending, and cultural stereotypes that link African-Americans to drug crimes.

After creating a fishnet…

## using {sf} to create the grid
## Note the `.[chicagoBoundary] %>% ` line. This is needed to clip the grid to our data
fishnet <- 
  st_make_grid(chicagoBoundary,
               cellsize = 500,  # grid cell size, in meters
               square = TRUE) %>%
  .[chicagoBoundary] %>%            # fast way to select intersecting polygons
  st_sf() %>%
  mutate(uniqueID = 1:n())

Map of the outcome joined to the fishnet.

## add a value of 1 to each crime, sum them with aggregate
crime_net <- 
  dplyr::select(crimes) %>% 
  mutate(countCrimes = 1) %>% 
  aggregate(., fishnet, sum) %>%
  mutate(countCrimes = replace_na(countCrimes, 0),
         uniqueID = 1:n(),
         cvID = sample(round(nrow(fishnet) / 24),  # assign each cell to one of ~n/24 random folds
                       size=nrow(fishnet), replace = TRUE))

ggplot() +
  geom_sf(data = crime_net, aes(fill = countCrimes), color = NA) +
  scale_fill_viridis() +
  labs(title = "Count of Crimes for the fishnet") +
  mapTheme()

Then download the spatial risk factor features…

## pulling four risk factor variables for the model
## using Socrata again
abandonCars <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Abandoned-Vehicles/3c9v-pnva") %>%
    mutate(year = substr(creation_date,1,4)) %>% filter(year == "2017") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
    st_transform(st_crs(fishnet)) %>%
    mutate(Legend = "Abandoned_Cars")

abandonBuildings <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Vacant-and-Abandoned-Building/7nii-7srd") %>%
    mutate(year = substr(date_service_request_was_received,1,4)) %>%  filter(year == "2017") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
    st_transform(st_crs(fishnet)) %>%
    mutate(Legend = "Abandoned_Buildings")


shotspotterAlerts <- 
  read.socrata("https://data.cityofchicago.org/Public-Safety/Violence-Reduction-Shotspotter-Alerts/3h7q-7mdb") %>%
    mutate(year = substr(date,1,4)) %>% filter(year == "2017") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
    st_transform(st_crs(fishnet)) %>%
    mutate(Legend = "Shotspotter_Alerts")

envcomplaint <- 
  read.socrata("https://data.cityofchicago.org/Environment-Sustainable-Development/CDPH-Environmental-Complaints/fypr-ksnz") %>%
    mutate(year = substr(complaint_date,1,4)) %>% filter(year == "2017") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
    st_transform(st_crs(fishnet)) %>%
    mutate(Legend = "Environment_Complaints")



## Neighborhoods to use in LOGO-CV in a bit
neighborhoods <- 
  st_read("https://raw.githubusercontent.com/blackmad/neighborhoods/master/chicago.geojson") %>%
  st_transform(st_crs(fishnet)) 

and aggregate features to our fishnet…

vars_net <- 
  rbind(abandonCars,abandonBuildings,
        envcomplaint, shotspotterAlerts) %>%
  st_join(fishnet, join=st_within) %>%
  st_drop_geometry() %>%
  group_by(uniqueID, Legend) %>%
  summarize(count = n()) %>%  # count of each risk factor per grid cell
  left_join(fishnet, ., by = "uniqueID") %>%
  spread(Legend, count, fill=0) %>%
  dplyr::select(-`<NA>`) %>%
  na.omit() %>%
  ungroup()   # ungroup so later operations aren't applied per-group

A small multiple map of the risk factors in the fishnet

vars_net.long <- 
  gather(vars_net, Variable, value, -geometry, -uniqueID)

vars <- unique(vars_net.long$Variable)
mapList <- list()

for(i in vars){
  mapList[[i]] <- 
    ggplot() +
      geom_sf(data = filter(vars_net.long, Variable == i), aes(fill=value), colour=NA) +
      scale_fill_viridis(name="") +
      labs(title=i) +
      mapTheme()}

do.call(grid.arrange,c(mapList, ncol=2,nrow=2, top="Risk Factors by Fishnet"))

Next, add nearest neighbor (NN) features…

st_c    <- st_coordinates
st_coid <- st_centroid
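
nn_function() is one of the book's helpers; a minimal sketch of what it computes, assuming {FNN} is available:

## a minimal sketch of nn_function(): the mean distance from each point in
## measureFrom to its k nearest neighbors in measureTo (assumes {FNN})
nn_function_sketch <- function(measureFrom, measureTo, k) {
  nn.dist <- FNN::get.knnx(measureTo, measureFrom, k = k)$nn.dist  # n-by-k matrix of distances
  rowMeans(nn.dist)                                                # average over the k neighbors
}

Smaller values of k capture very local exposure; larger values smooth the feature across space.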

## create NN (average distance to the 3 nearest neighbors) features for each risk factor
vars_net <- vars_net %>%
    mutate(
      Abandoned_Buildings.fe =
        nn_function(st_c(st_coid(vars_net)), st_c(abandonBuildings),k=3),
      Abandoned_Cars.fe =
        nn_function(st_c(st_coid(vars_net)), st_c(abandonCars),k=3),
      shotspotter_alert.fe =
        nn_function(st_c(st_coid(vars_net)), st_c(shotspotterAlerts),k=3),
      env_complaints.fe =
        nn_function(st_c(st_coid(vars_net)), st_c(envcomplaint),k=3))
## Visualize the NN features (split by name suffix, "s.fe" vs "t.fe", just for plotting)
vars_net_textbook <-
  dplyr::select(vars_net, ends_with("s.fe")) %>%
    gather(Variable, value, -geometry)

vars_net_add <-
  dplyr::select(vars_net, ends_with("t.fe")) %>%
    gather(Variable, value, -geometry)

ggplot() +
      geom_sf(data = vars_net_textbook, aes(fill=value), colour=NA) +
      scale_fill_viridis(name="NN Distance") +
      labs(title="Abandoned Car NN Distance") +
      facet_wrap(~Variable) +
      mapTheme()

ggplot() +
      geom_sf(data = vars_net_add, aes(fill=value), colour=NA) +
      scale_fill_viridis(name="NN Distance") +
      labs(title="Abandoned Car NN Distance") +
      facet_wrap(~Variable) +
      mapTheme()

Join the NN features to our fishnet…

Since the counts were aggregated to each cell by uniqueID we can use that to join the counts to the fishnet.

## important to drop the geometry from the table being joined
final_net <-
  left_join(crime_net, st_drop_geometry(vars_net), by="uniqueID") 

Join in areal data…

final_net <-
  st_centroid(final_net) %>%
    st_join(dplyr::select(neighborhoods, name)) %>%        # spatial join: tag each cell centroid with its neighborhood
    st_join(dplyr::select(policeDistricts, District)) %>%  # ...and with its police district
      st_drop_geometry() %>%
      left_join(dplyr::select(final_net, geometry, uniqueID), by = "uniqueID") %>%
      st_sf() %>%
  na.omit()

# for live demo
 mapview::mapview(final_net, zcol = "District")
## generates warnings from PROJ issues
## {spdep} to build a queen-contiguity neighbors list from the fishnet polygons...
final_net.nb <- poly2nb(as_Spatial(final_net), queen=TRUE)
## ... and convert the neighbors list to spatial weights
final_net.weights <- nb2listw(final_net.nb, style="W", zero.policy=TRUE) 

#print(final_net.weights, zero.policy=TRUE)
## see ?localmoran
local_morans <- localmoran(final_net$countCrimes, final_net.weights, zero.policy=TRUE) %>% 
  as.data.frame()
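
For reference, localmoran() computes Anselin's Local Moran's I for each cell $i$, comparing that cell's deviation from the citywide mean count to the spatially weighted deviations of its neighbors:

$$I_i = \frac{x_i - \bar{x}}{S_i^2} \sum_{j} w_{ij}\,(x_j - \bar{x}), \qquad S_i^2 = \frac{\sum_{j \neq i} (x_j - \bar{x})^2}{n - 1}$$

A large positive $I_i$ with a small p-value marks a cell whose count clusters with similarly high (or low) neighboring counts.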



# join local Moran's I results to fishnet
final_net.localMorans <- 
  cbind(local_morans, as.data.frame(final_net)) %>% 
  st_sf() %>%
  dplyr::select(crime_count = countCrimes, 
                Local_Morans_I = Ii, 
                P_Value = `Pr(z != E(Ii))`) %>%
  mutate(Significant_Hotspots = ifelse(P_Value <= 0.001, 1, 0)) %>%
  gather(Variable, Value, -geometry)

Plotting local Moran’s I results…

## This is just for plotting
vars <- unique(final_net.localMorans$Variable)
varList <- list()

for(i in vars){
  varList[[i]] <- 
    ggplot() +
      geom_sf(data = filter(final_net.localMorans, Variable == i), 
              aes(fill = Value), colour=NA) +
      scale_fill_viridis(name="") +
      labs(title=i) +
      mapTheme(title_size = 14) + theme(legend.position="bottom")}

do.call(grid.arrange,c(varList, ncol = 4, top = "Local Moran's I statistics, Crimes"))

Distance to hot spots…

Using the NN distance to the nearest significant hot spot cell

# generates warning from NN
final_net <- final_net %>%
  mutate(crime.isSig =
           ifelse(local_morans[,5] <= 0.001, 1, 0)) %>%  # column 5 of localmoran() output is the p-value
  mutate(crime.isSig.dist =
           nn_function(st_c(st_coid(final_net)),
                       st_c(st_coid(filter(final_net,
                                           crime.isSig == 1))),
                       k = 1))

Plot NN distance to hot spot

ggplot() +
      geom_sf(data = final_net, aes(fill=crime.isSig.dist), colour=NA) +
      scale_fill_viridis(name="NN Distance") +
      labs(title="Distance to highly significant hot spots") +
      mapTheme()

Modeling and CV

Leave-one-group-out (LOGO) CV on spatial features marks the start of the modeling stage.

# View(crossValidate)
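
crossValidate() is also a book helper; a minimal sketch, assuming it fits a Poisson count regression on all groups except one held-out fold at a time:

## a minimal LOGO-CV sketch of crossValidate(), assuming a Poisson glm;
## id can be a neighborhood name (LOGO-CV) or a random fold id (k-fold CV)
crossValidate_sketch <- function(dataset, id, dependentVariable, indVariables) {
  allPredictions <- data.frame()
  for (thisFold in unique(dataset[[id]])) {
    fold.train <- dataset[dataset[[id]] != thisFold, ]  # train on every other group
    fold.test  <- dataset[dataset[[id]] == thisFold, ]  # predict the held-out group
    form <- reformulate(indVariables, response = dependentVariable)
    reg  <- glm(form, family = "poisson", data = st_drop_geometry(fold.train))
    fold.test$Prediction <- predict(reg, st_drop_geometry(fold.test), type = "response")
    allPredictions <- rbind(allPredictions, fold.test)
  }
  st_sf(allPredictions)
}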

## define the variables we want
reg.ss.vars <- c("Abandoned_Cars.fe","Abandoned_Buildings.fe","shotspotter_alert.fe","env_complaints.fe", "crime.isSig.dist")

## RUN REGRESSIONS
reg.ss.spatialCV <- crossValidate(
  dataset = final_net,
  id = "name",                           
  dependentVariable = "countCrimes",
  indVariables = reg.ss.vars) %>%
    dplyr::select(cvID = name, countCrimes, Prediction, geometry)

A table of errors by neighborhood fold.

# calculate errors by NEIGHBORHOOD
error_by_reg_and_fold <- 
  reg.ss.spatialCV %>%
    group_by(cvID) %>% 
    summarize(Mean_Error = mean(Prediction - countCrimes, na.rm = T),
              MAE = mean(abs(Mean_Error), na.rm = T),  # one Mean_Error per fold, so MAE reduces to |Mean_Error|
              SD_MAE = mean(abs(Mean_Error), na.rm = T)) %>%
  ungroup()

error_by_reg_and_fold %>% 
  arrange(desc(MAE))
## Simple feature collection with 96 features and 4 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: 341164.6 ymin: 552874.5 xmax: 367164.6 ymax: 594874.5
## Projected CRS: NAD_1983_HARN_StatePlane_Illinois_East_FIPS_1201
## # A tibble: 96 × 5
##    cvID              Mean_Error   MAE SD_MAE                            geometry
##    <chr>                  <dbl> <dbl>  <dbl>                       <POLYGON [m]>
##  1 Garfield Park         -25.7  25.7   25.7  ((350164.6 577874.5, 349664.6 5778…
##  2 Ukrainian Village      12.0  12.0   12.0  ((353164.6 580874.5, 353164.6 5813…
##  3 Humboldt Park          -9.87  9.87   9.87 ((349164.6 581874.5, 349164.6 5823…
##  4 Loop                   -7.04  7.04   7.04 ((357664.6 579374.5, 357664.6 5798…
##  5 Printers Row           -6.42  6.42   6.42 ((358664.6 577374.5, 358164.6 5773…
##  6 United Center           5.94  5.94   5.94 ((353164.6 579374.5, 353164.6 5798…
##  7 Mckinley Park           5.49  5.49   5.49 ((353664.6 573874.5, 354164.6 5738…
##  8 North Lawndale         -5.07  5.07   5.07 ((349164.6 576874.5, 349164.6 5773…
##  9 Lower West Side         4.58  4.58   4.58 ((353664.6 574874.5, 353664.6 5753…
## 10 Wicker Park             4.46  4.46   4.46 ((353164.6 582374.5, 353164.6 5828…
## # … with 86 more rows
error_by_reg_and_fold %>% 
  arrange(MAE)
## Simple feature collection with 96 features and 4 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: 341164.6 ymin: 552874.5 xmax: 367164.6 ymax: 594874.5
## Projected CRS: NAD_1983_HARN_StatePlane_Illinois_East_FIPS_1201
## # A tibble: 96 × 5
##    cvID                  Mean_Error     MAE  SD_MAE                     geometry
##    <chr>                      <dbl>   <dbl>   <dbl>               <GEOMETRY [m]>
##  1 West Loop               -0.00187 0.00187 0.00187 POLYGON ((355164.6 579874.5…
##  2 Norwood Park            -0.00448 0.00448 0.00448 MULTIPOLYGON (((342164.6 58…
##  3 Jefferson Park          -0.0140  0.0140  0.0140  POLYGON ((345164.6 589874.5…
##  4 North Park               0.0224  0.0224  0.0224  POLYGON ((350164.6 589374.5…
##  5 Hegewisch                0.0309  0.0309  0.0309  POLYGON ((364164.6 552874.5…
##  6 Avalon Park             -0.0319  0.0319  0.0319  POLYGON ((361164.6 563874.5…
##  7 Museum Campus            0.0460  0.0460  0.0460  MULTIPOLYGON (((359664.6 57…
##  8 South Deering            0.0606  0.0606  0.0606  POLYGON ((362164.6 554874.5…
##  9 Sauganash,Forest Glen    0.0808  0.0808  0.0808  POLYGON ((346664.6 591874.5…
## 10 Clearing                 0.0868  0.0868  0.0868  POLYGON ((344164.6 568374.5…
## # … with 86 more rows
## plot histogram of OOF (out of fold) errors
error_by_reg_and_fold %>%
  ggplot(aes(MAE)) + 
    geom_histogram(bins = 30, colour="black", fill = "#FDE725FF") +
  scale_x_continuous(breaks = seq(0, 11, by = 1)) + 
    labs(title="Distribution of MAE", subtitle = "LOGO-CV",
         x="Mean Absolute Error", y="Count") 

# demo of kernel width
crime_ppp <- as.ppp(st_coordinates(crimes), W = st_bbox(final_net))
crime_KD.1000 <- spatstat.core::density.ppp(crime_ppp, 1000)
crime_KD.1500 <- spatstat.core::density.ppp(crime_ppp, 1500)
crime_KD.2000 <- spatstat.core::density.ppp(crime_ppp, 2000)
crime_KD.df <- rbind(
  mutate(data.frame(rasterToPoints(mask(raster(crime_KD.1000), as(neighborhoods, 'Spatial')))), Legend = "1000m"),
  mutate(data.frame(rasterToPoints(mask(raster(crime_KD.1500), as(neighborhoods, 'Spatial')))), Legend = "1500m"),
  mutate(data.frame(rasterToPoints(mask(raster(crime_KD.2000), as(neighborhoods, 'Spatial')))), Legend = "2000m"))  # bandwidths in meters, the units of the projected CRS

crime_KD.df$Legend <- factor(crime_KD.df$Legend, levels = c("1000m", "1500m", "2000m"))

ggplot(data=crime_KD.df, aes(x=x, y=y)) +
  geom_raster(aes(fill=layer)) + 
  facet_wrap(~Legend) +
  coord_sf(crs=st_crs(final_net)) + 
  scale_fill_viridis(name="Density") +
  labs(title = "Kernel density with 3 different search radii") +
  mapTheme(title_size = 14)

as.data.frame(crime_KD.1000) %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(final_net)) %>%
  aggregate(., final_net, mean) %>%
   ggplot() +
     geom_sf(aes(fill=value)) +
     geom_sf(data = sample_n(crimes, 1500), size = .5) +
     scale_fill_viridis(name = "Density") +
     labs(title = "Kernel density of 2017 crimes") +
     mapTheme(title_size = 14)

correlation.long <-
  st_drop_geometry(final_net) %>%
    dplyr::select(-uniqueID, -cvID, -name, -District) %>%
    gather(Variable, Value, -countCrimes)

correlation.cor <-
  correlation.long %>%
    group_by(Variable) %>%
    summarize(correlation = cor(Value, countCrimes, use = "complete.obs"))
    
ggplot(correlation.long, aes(Value, countCrimes)) +
  geom_point(size = 0.1) +
  geom_text(data = correlation.cor, aes(label = paste("r =", round(correlation, 2))),
            x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) +
  geom_smooth(method = "lm", se = FALSE, colour = "black") +
  facet_wrap(~Variable, ncol = 2, scales = "free") +
  labs(title = "Crime count as a function of risk factors") +
  plotTheme()

## "Just Risk Factors" uses only the NN risk factor features
reg.vars <- c("Abandoned_Buildings.fe", "Abandoned_Cars.fe","shotspotter_alert.fe",
                 "env_complaints.fe")

## "Spatial Process" adds the local Moran's I hot spot features
reg.ss.vars <- c("Abandoned_Buildings.fe", "Abandoned_Cars.fe","shotspotter_alert.fe",
                 "env_complaints.fe","crime.isSig", "crime.isSig.dist")

reg.cv <- crossValidate(
  dataset = final_net,
  id = "cvID",
  dependentVariable = "countCrimes",
  indVariables = reg.vars) %>%
    dplyr::select(cvID = cvID, countCrimes, Prediction, geometry)

reg.ss.cv <- crossValidate(
  dataset = final_net,
  id = "cvID",
  dependentVariable = "countCrimes",
  indVariables = reg.ss.vars) %>%
    dplyr::select(cvID = cvID, countCrimes, Prediction, geometry)
  
reg.spatialCV <- crossValidate(
  dataset = final_net,
  id = "name",
  dependentVariable = "countCrimes",
  indVariables = reg.vars) %>%
    dplyr::select(cvID = name, countCrimes, Prediction, geometry)

reg.ss.spatialCV <- crossValidate(
  dataset = final_net,
  id = "name",
  dependentVariable = "countCrimes",
  indVariables = reg.ss.vars) %>%
    dplyr::select(cvID = name, countCrimes, Prediction, geometry)
reg.summary <- 
  rbind(
    mutate(reg.cv,           Error = Prediction - countCrimes,
                             Regression = "Random k-fold CV: Just Risk Factors"),
                             
    mutate(reg.ss.cv,        Error = Prediction - countCrimes,
                             Regression = "Random k-fold CV: Spatial Process"),
    
    mutate(reg.spatialCV,    Error = Prediction - countCrimes,
                             Regression = "Spatial LOGO-CV: Just Risk Factors"),
                             
    mutate(reg.ss.spatialCV, Error = Prediction - countCrimes,
                             Regression = "Spatial LOGO-CV: Spatial Process")) %>%
    st_sf() 

error_by_reg_and_fold <- 
  reg.summary %>%
    group_by(Regression, cvID) %>% 
    summarize(Mean_Error = mean(Prediction - countCrimes, na.rm = T),
              MAE = mean(abs(Mean_Error), na.rm = T),
              SD_MAE = mean(abs(Mean_Error), na.rm = T)) %>%
  ungroup()

error_by_reg_and_fold %>%
  ggplot(aes(MAE)) + 
    geom_histogram(bins = 30, colour="black", fill = "#FDE725FF") +
    facet_wrap(~Regression) +  
    geom_vline(xintercept = 0) + scale_x_continuous(breaks = seq(0, 8, by = 1)) + 
    labs(title="Distribution of MAE", subtitle = "k-fold cross validation vs. LOGO-CV",
         x="Mean Absolute Error", y="Count") +
    plotTheme()

A table of the mean and standard deviation of MAE by regression type.

st_drop_geometry(error_by_reg_and_fold) %>%
  group_by(Regression) %>% 
    summarize(Mean_MAE = round(mean(MAE), 2),
              SD_MAE = round(sd(MAE), 2)) %>%
  kable() %>%
    kable_styling("striped", full_width = F) %>%
    row_spec(2, color = "black", background = "#FDE725FF") %>%
    row_spec(4, color = "black", background = "#FDE725FF") 
## Regression                           Mean_MAE  SD_MAE
## Random k-fold CV: Just Risk Factors      1.35    1.74
## Random k-fold CV: Spatial Process        1.35    1.74
## Spatial LOGO-CV: Just Risk Factors       1.23    1.33
## Spatial LOGO-CV: Spatial Process         1.23    1.33
error_by_reg_and_fold %>%
  filter(str_detect(Regression, "LOGO")) %>%
  ggplot() +
    geom_sf(aes(fill = MAE)) +
    facet_wrap(~Regression) +
    scale_fill_viridis() +
    labs(title = "Crime errors by LOGO-CV Regression") +
    mapTheme() + theme(legend.position="bottom")

neighborhood.weights <-
  filter(error_by_reg_and_fold, Regression == "Spatial LOGO-CV: Spatial Process") %>%
    group_by(cvID) %>%
      poly2nb(as_Spatial(.), queen=TRUE) %>%
      nb2listw(., style="W", zero.policy=TRUE)

filter(error_by_reg_and_fold, str_detect(Regression, "LOGO"))  %>% 
    st_drop_geometry() %>%
    group_by(Regression) %>%
    summarize(Morans_I = moran.mc(abs(Mean_Error), neighborhood.weights, 
                                 nsim = 999, zero.policy = TRUE, 
                                 na.action=na.omit)[[1]],
              p_value = moran.mc(abs(Mean_Error), neighborhood.weights, 
                                 nsim = 999, zero.policy = TRUE, 
                                 na.action=na.omit)[[3]])
## # A tibble: 2 × 3
##   Regression                         Morans_I p_value
##   <chr>                                 <dbl>   <dbl>
## 1 Spatial LOGO-CV: Just Risk Factors    0.184   0.006
## 2 Spatial LOGO-CV: Spatial Process      0.184   0.005

Get 2018 crime data…

crimes18 <- 
  read.socrata("https://data.cityofchicago.org/Public-Safety/Crimes-2018/3i3m-jwuy") %>% 
  filter(Primary.Type == "NARCOTICS") %>%
  mutate(x = gsub("[()]", "", Location)) %>%
  separate(x,into= c("Y","X"), sep=",") %>%
  mutate(X = as.numeric(X),
         Y = as.numeric(Y)) %>% 
  na.omit %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform('ESRI:102271') %>% 
  distinct() %>%
  .[fishnet,]
crime_KDE_sum <- as.data.frame(crime_KD.1000) %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(final_net)) %>%
  aggregate(., final_net, mean) 
kde_breaks <- classIntervals(crime_KDE_sum$value, 
                             n = 5, "fisher")
crime_KDE_sf <- crime_KDE_sum %>%
  mutate(label = "Kernel Density",
         Risk_Category = classInt::findCols(kde_breaks),
         Risk_Category = case_when(
           Risk_Category == 5 ~ "5th",
           Risk_Category == 4 ~ "4th",
           Risk_Category == 3 ~ "3rd",
           Risk_Category == 2 ~ "2nd",
           Risk_Category == 1 ~ "1st")) %>%
  cbind(
    aggregate(
      dplyr::select(crimes18) %>% mutate(crimeCount = 1), ., sum) %>%
    mutate(crimeCount = replace_na(crimeCount, 0))) %>%
  dplyr::select(label, Risk_Category, crimeCount)
ml_breaks <- classIntervals(reg.ss.spatialCV$Prediction, 
                             n = 5, "fisher")
crime_risk_sf <-
  reg.ss.spatialCV %>%
  mutate(label = "Risk Predictions",
         Risk_Category =classInt::findCols(ml_breaks),
         Risk_Category = case_when(
           Risk_Category == 5 ~ "5th",
           Risk_Category == 4 ~ "4th",
           Risk_Category == 3 ~ "3rd",
           Risk_Category == 2 ~ "2nd",
           Risk_Category == 1 ~ "1st")) %>%
  cbind(
    aggregate(
      dplyr::select(crimes18) %>% mutate(crimeCount = 1), ., sum) %>%
      mutate(crimeCount = replace_na(crimeCount, 0))) %>%
  dplyr::select(label,Risk_Category, crimeCount)

The map below compares kernel density to risk predictions for 2018 crimes.

rbind(crime_KDE_sf, crime_risk_sf) %>%
  na.omit() %>%
  gather(Variable, Value, -label, -Risk_Category, -geometry) %>%
  ggplot() +
    geom_sf(aes(fill = Risk_Category), colour = NA) +
    geom_sf(data = sample_n(crimes18, 3000), size = .5, colour = "black") +
    facet_wrap(~label) +
    scale_fill_viridis(discrete = TRUE) +
    labs(title="Comparison of Kernel Density and Risk Predictions",
         subtitle="2017 crime risk predictions; 2018 crimes") +
    mapTheme(title_size = 14)

The bar plot below makes the same comparison.

rbind(crime_KDE_sf, crime_risk_sf) %>%
  st_drop_geometry() %>%
  na.omit() %>%
  gather(Variable, Value, -label, -Risk_Category) %>%
  group_by(label, Risk_Category) %>%
  summarize(countCrimes = sum(Value)) %>%
  ungroup() %>%
  group_by(label) %>%
  mutate(Pcnt_of_test_set_crimes = countCrimes / sum(countCrimes)) %>%
    ggplot(aes(Risk_Category,Pcnt_of_test_set_crimes)) +
      geom_bar(aes(fill=label), position="dodge", stat="identity") +
      scale_fill_viridis(discrete = TRUE, name = "Model") +
      labs(title = "Risk prediction vs. Kernel density, 2018 crimes",
           y = "% of Test Set crimes (per model)",
           x = "Risk Category") +
  theme_bw() +
      theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

Conclusion

The existing empirical evidence suggests that the racial disparities exacerbated by the war on drugs owe more to political expediency and racialized politics: as noted above, the policy decision to dramatically increase arrests of street-level drug offenders, the more public nature of African-American drug offending, and cultural stereotypes linking African-Americans to drug crimes.
A geospatial risk prediction model borrows the narcotics experience from places where it has been observed and tests whether that experience generalizes to places where risk may be high despite few recorded events. The model almost certainly suffers from some level of bias: if law enforcement systematically over-polices certain communities, and that selection criterion goes unaccounted for, the model will be biased regardless of the tests above.
I would not recommend this algorithm be put into production. As the textbook asks: “What if the $10 million in savings lead police to increase enforcement and surveillance disproportionately in Black and Brown communities? Worse, what about feedback effects where steering police to these neighborhoods cause more reported crime, which then leads to increased predicted risk?”
I agree that the right approach is a function of community standards. Machine learning may be the next logical step in police analytics, but it must avoid becoming a tool of the surveillance state.