Population of Canada within 100 miles of the US border, with Open Data and R!

plot of chunk unnamed-chunk-6

Introduction

This tutorial uses the sf package to perform geoprocessing, and ggplot to visualize the data. Additionally, we practice downloading open-source data from the internet.

Adjust Settings, Load Packages

knitr::opts_chunk$set(echo = TRUE)
for(p in c("ggplot2","sf","dplyr","plyr","raster")){library(p,character.only = T)}
directory=".../Layers_Canada/CanadaPopulationDensity/"
theme_set(theme_bw()+
            theme(
              panel.background = element_blank(),axis.ticks = element_blank(),
              legend.background = element_blank(),panel.grid = element_line(colour = 0)
            ))

Download Open-Source data from open.canada.ca

setwd(directory)
if(!("CAdata.zip" %in% list.files())){
  #https://open.canada.ca/data/en/dataset/5be03a46-8504-40a7-a96c-af195bae0428
  download.file(url = "http://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/gecu000e11a_e.zip",
                dest=paste0(directory,"CAdata.zip"))
}
if(!("CAcensusdata" %in% list.files())){unzip(zipfile = "CAdata.zip",exdir = "CAcensusdata")}
#https://open.canada.ca/data/en/dataset/ece81c43-aa4e-41ef-86c2-3835eb5aa95c
download.file(url = "http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=701&OFT=FULLCSV",dest=paste0(directory,"CAcensusdata/CensusData.csv"))
if(!("Borders.zip" %in% list.files())){
  #http://thematicmapping.org/downloads/world_borders.php
  download.file(url = "http://thematicmapping.org/downloads/TM_WORLD_BORDERS-0.3.zip",dest=paste0(directory,"Borders.zip"))
}
setwd(setwd(directory))
if(!("WorldBorders" %in% list.files())){unzip(zipfile = "Borders.zip",exdir = "WorldBorders")}

Inspect the Canadian Census data from 2016.

We see that population within census divisions is log-normally distributed. We can also confirm that the population density calculations within each census division match the population and area values. The population of Canada in 2016 was 35.2 million people.
census=read.csv(paste0(directory,"CAcensusdata/CensusData.csv"))
ggplot(data=census)+
  geom_histogram(aes(x=Population..2016))+
  scale_x_log10()
plot of chunk unnamed-chunk-3
# Confirm that Population Density calculations match areas
ggplot(data=census)+
  geom_point(aes(y=Population..2016,x=Land.area.in.square.kilometres..2016*Population.density.per.square.kilometre..2016))+
  geom_abline(intercept=0,slope=1)
plot of chunk unnamed-chunk-3
print(paste(round(sum(census$Population..2016,na.rm=TRUE)*10^-6,1),"million people"))
## [1] "35.2 million people"

Inspect GIS Data

proj="+proj=aea +lat_1=50 +lat_2=70 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs "

CAshp=st_read(paste0(directory,"CAcensusdata/gcd_000e11a_e.shp"))
## Reading layer `gcd_000e11a_e' from data source `...\Layers_Canada\CanadaPopulationDensity\CAcensusdata\gcd_000e11a_e.shp' using driver `ESRI Shapefile'
## Simple feature collection with 293 features and 5 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -141.0181 ymin: 41.71096 xmax: -52.61941 ymax: 83.1355
## epsg (SRID):    4269
## proj4string:    +proj=longlat +datum=NAD83 +no_defs
CAshp=st_transform(CAshp,proj)
CAshp=dplyr::left_join(CAshp,census,by=c("CDUID"="Geographic.code"))

ggplot()+
  geom_sf(data=CAshp)+
  ggtitle("Canada Census Divisions")
plot of chunk unnamed-chunk-4
borders=st_read(paste0(directory,"WorldBorders/TM_WORLD_BORDERS-0.3.shp"))
## Reading layer `TM_WORLD_BORDERS-0.3' from data source `...\Layers_Canada\CanadaPopulationDensity\WorldBorders\TM_WORLD_BORDERS-0.3.shp' using driver `ESRI Shapefile'
## Simple feature collection with 246 features and 11 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -180 ymin: -90 xmax: 180 ymax: 83.6236
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
USA=borders[borders$NAME=="United States",c("NAME","geometry")]
USA=st_transform(USA,proj)
st_crs(USA)$units
## [1] "m"
USA_buf=st_buffer(USA,dist=160934) # 100 miles = 160934 meters

ggplot()+
  geom_sf(data=USA,fill="gray",col="transparent")+
  geom_sf(data=USA_buf,fill="transparent",col=2)+
  ggtitle("US Border and 100 mile buffer")
plot of chunk unnamed-chunk-4

Inspect Population Density Data

We can see that the region within 100 miles of the US border is highly populated, but there are cities like Calgary, Edmonton, and Saskatoon that are north of this boundary.
ggplot()+
  geom_sf(data=CAshp,aes(fill=log10(Population.density.per.square.kilometre..2016)),col="transparent")+
  geom_sf(data=USA_buf,col=2,fill="transparent")+
  coord_sf(xlim = extent(CAshp)[1:2],ylim = extent(CAshp)[3:4])+
  ggtitle("Canadian Census Divisions and Population Density")+
  scale_fill_gradient(name="Pop_dens",low="darkblue",high="red",na.value="darkblue",
                      limits=c(-1.2,3.5),breaks=c(-1:3),labels=(10^c(-1:3)))
plot of chunk unnamed-chunk-5

Use st_intersection to extract census tracts

We use st_intersection to intersect the buffered US borer polygon and Canadian census tracts. We assume population is evenly distributed within census divisions, allowing us to adjust the size of intersected tracts for their new clipped size. Inspecting their new calculated areas, we see that many tracts are completely within the 100 mile buffer, but others have a much lower area.
Our final calculation finds that 27.3 million people live within 100 miles of the US border, or 77.7% of the population.
intersect=st_intersection(CAshp,USA_buf)
plot=ggplot()+
  geom_sf(data=CAshp,fill="gray",col="transparent")+
  geom_sf(data=intersect,aes(fill=log10(Population.density.per.square.kilometre..2016)),col="transparent")+
  ggtitle("Population of Canada within 100 miles of the US border")+
  scale_fill_gradient(name=expression(paste("Persons km",NA^-2)),
                      low="darkblue",high="red",na.value="darkblue",
                      limits=c(-1.2,3.5),breaks=c(-1:3),labels=(10^c(-1:3)))+
  theme(legend.position = c(0.85,0.8),axis.ticks = element_blank(),
        axis.text = element_blank(),axis.title = element_blank())
ggsave(plot,filename = paste0(directory,"BufferPopDens.png"),width = 5,height=4,dpi=300,units="in")
plot
plot of chunk unnamed-chunk-6
intersect=intersect%>%mutate(area_m2 = st_area(.) %>% as.numeric())
intersect$area_km2=intersect$area_m2*(1000^-2)

ggplot(data=intersect)+
  geom_point(aes(y=area_km2,x=Land.area.in.square.kilometres..2016))+
  scale_y_log10()+scale_x_log10()+
  geom_abline(intercept=0,slope=1)
plot of chunk unnamed-chunk-6
intersect$pop_2016_intersect=intersect$area_km2*intersect$Population.density.per.square.kilometre..2016
paste0(round(sum(intersect$pop_2016_intersect)*10^-6,1)," million people")
## [1] "27.3 million people"
print(paste0(round(100*sum(intersect$pop_2016_intersect,na.rm=TRUE)/sum(census$Population..2016,na.rm=TRUE),1),"% of population within 100 miles of the US border"))
## [1] "77.7% of population within 100 miles of the US border"

What about Canada's population in relation to some US city? Seattle?

lon=seq(-140,-45,2)
df <- data.frame(lon=c(lon,rev(lon)),lat=c(rep(47.6,length(lon)),rep(30,length(lon))))
df <- rbind(df, df[1,])
seattle <- st_sf(st_sfc(st_polygon(list(as.matrix(df)))), crs = 4326)
seattle=st_transform(seattle,proj)

intersect=st_intersection(seattle,CAshp)
intersect=intersect%>%mutate(area_m2 = st_area(.) %>% as.numeric())
intersect$area_km2=intersect$area_m2*(1000^-2)
intersect$pop_2016_intersect=intersect$area_km2*intersect$Population.density.per.square.kilometre..2016


ggplot()+
  geom_sf(data=CAshp,fill="gray",col="transparent")+
  geom_sf(data=intersect,aes(fill=log10(Population.density.per.square.kilometre..2016)),col="transparent")+
  geom_sf(data=seattle,col=2,fill="transparent")+
  coord_sf(xlim = extent(CAshp)[1:2],ylim = extent(CAshp)[3:4])+
  ggtitle("Population of Canada south of Seattle, Washington")+
  scale_fill_gradient(name=expression(paste("Persons km",NA^-2)),
                      low="darkblue",high="red",na.value="darkblue",
                      limits=c(-1.2,3.5),breaks=c(-1:3),labels=(10^c(-1:3)))+
  theme(legend.position = c(0.85,0.8),axis.ticks = element_blank(),
        axis.text = element_blank(),axis.title = element_blank())
plot of chunk unnamed-chunk-7
paste0(round(sum(intersect$pop_2016_intersect)*10^-6,1)," million people")
## [1] "23.3 million people"
print(paste0(round(100*sum(intersect$pop_2016_intersect,na.rm=TRUE)/sum(census$Population..2016,na.rm=TRUE),1),"% of population south of Seattle, Washington State"))
## [1] "66.2% of population south of Seattle, Washington State"