-
Notifications
You must be signed in to change notification settings - Fork 0
/
border-states.R
226 lines (185 loc) · 8.92 KB
/
border-states.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
## 'Border' States
library(sf)
library(dplyr)
library(ggplot2)
library(ggrepel)
library(units)
library(readr)
library(rvest)
# Grab some US state shapefiles from the census bureau
# dir.create("data")
# Uncomment the next lines to download and unpack the data
# download.file("http://www2.census.gov/geo/tiger/GENZ2016/shp/cb_2016_us_state_20m.zip",
# "data/cb_2016_us_state_20m.zip")
# unzip("data/cb_2016_us_state_20m.zip", exdir = "data")
# Transform to a projection from long-lat form
us <- st_read("./data/cb_2016_us_state_20m.shp") %>%
st_transform("+init=epsg:26978") %>%
mutate(area = st_area(st_geometry(.)))
# 100 miles in meters
meters <- 160934.4
# An outline of the US (as a geometry set)
us_outline <- st_union(us)
# Buffer to get the US 100 miles inland from its borders
not_border_zone <- st_buffer(us_outline, dist = -meters) # negative => inside
border_zone <- st_difference(us_outline, not_border_zone)
# How much of the US is that anyway?
border_zone_area <- st_area(border_zone)
not_border_zone_area <- st_area(not_border_zone)
# Speak American please
set_units(border_zone_area, miles^2)
set_units(not_border_zone_area, miles^2)
# Apparently a lot of the US is 'border'
prop_us_covered <- as.numeric(border_zone_area /
(border_zone_area + not_border_zone_area))
prop_us_covered # yikes
# How much of each state is 'border' zone?
us_border <- st_intersection(us, border_zone)
us_border$area <- st_area(us_border) # correct area
st_geometry(us_border) <- NULL
border_areas_by_state <- left_join(us, us_border,
by = "NAME", suffix = c("", "_border")) %>%
mutate(prop = ifelse(is.na(area_border), 0, area_border / area),
state_name = factor(NAME, levels = NAME[order(prop)])) %>%
arrange(desc(prop))
# Plot the proportions
ggplot(border_areas_by_state, aes(x = state_name, y = 100 * prop)) +
geom_point() +
coord_flip() +
labs(y = "Percent",
x = "State / Territory",
title = "Percentage of each State or Territory in 'Border' Zone",
subtitle = sprintf("Percentage of Total Landmass: %.2f", prop_us_covered)) +
theme_minimal()
# ggsave("pics/border-zone-proportions-by-state.png",
# dpi = 300, width = 7, height = 7)
# How does this look with state populations rather than state areas?
#
# We can *approximate* the 'border' population using county-level population
# data: the number of the people in a 'border' zone in a state is the sum of the
# populations of all of its counties, weighted by the proportion of the
# county that is in the border zone. Clearly this will be a bad approximation
# when the county is big and the population is far from evenly distributed, and
# a better approximation when the county is small or the population is evenly
# distributed within it. A smaller population unit would help, but would make
# processing slower. County granularity is enough to get the idea.
# Here's some 2016 county level population data, also from the Census Bureau.
# Select the 'United States' link at
# https://www.census.gov/data/tables/2016/demo/popest/counties-total.html
# and download. When it lands it will be called PEP_2016_PEPANNRES.zip
# I shall assume you put it in the `data/` folder.
# unzip("data/PEP_2016_PEPANNRES.zip", exdir = "data")
state_pop <- read_csv("data/PEP_2016_PEPANNRES_with_ann.csv", skip = 2,
col_types = "_cc________n", col_names = FALSE)
names(state_pop) <- c("GEOID", "name", "pop2016")
# Unfortunately it doesn't have Puerto Rico counties in it. However, Puerto
# Rico is small enough to all be in the 'border' zone so we know where it's
# going to land
# Get shapefiles for counties from the Census Bureau
#
# download.file("http://www2.census.gov/geo/tiger/GENZ2016/shp/cb_2016_us_county_20m.zip",
# "data/cb_2016_us_county_20m.zip")
# unzip("data/cb_2016_us_county_20m.zip", exdir = "data")
# These shapes have FIPS codes instead of state names, so we'll grab a translation from
# https://en.wikipedia.org/wiki/Federal_Information_Processing_Standard_state_code
# and tidy it up with rvest
#
fips <- "https://en.wikipedia.org/wiki/Federal_Information_Processing_Standard_state_code"
state_fips <- read_html(fips) %>%
html_node("table") %>%
html_table %>%
setNames(c("state_name", "alpha", "STATEFP", "status")) %>%
mutate(STATEFP = sprintf("%02d", STATEFP))
# Now to read in the shapes and fold in the state names and the population info
counties <- st_read("./data/cb_2016_us_county_20m.shp") %>%
st_transform("+init=epsg:26978") %>%
filter(STATEFP != 72) %>% # Remove FIPS code 72 (Puerto Rico)
mutate(GEOID = as.character(GEOID),
STATEFP = as.character(STATEFP), # join keys as character
area = st_area(st_geometry(.))) %>%
inner_join(state_pop, by = "GEOID") %>% # fold in populations
left_join(state_fips, by = "STATEFP") # and state names
# Note that CB's ALAND (land area) is almost certainly better, but it's not clear
# how comparable it is to a crude polgon intersection to get border area...
# How bad is st_area?
plot(overlaps$ALAND, as.numeric(overlaps$area),
xlab = "ALAND from Census Bureau",
ylab = "Estimate from st_area(polygons)",
main = "Census Bureau County Areas vs Polygon County Areas")
overlaps %>%
arrange(desc(ALAND)) %>%
head(8) %>%
with(text(ALAND, area, state_name, pos = 2))
abline(a = 0, b = 1)
# Mostly we're wrong about Alaska and a bit of California
# Alaska's sparsely populated so let's see whether these errors will make a
# difference in more populous spots
with(overlaps, plot(pop2016, as.numeric(area) - ALAND, log = "x",
xlab = "Population (logged)",
ylab = "Area Estimation Error",
main = "Estimation Errors by Population"))
overlaps %>%
arrange(desc(as.numeric(area) - ALAND)) %>%
head(6) %>%
with(text(pop2016, as.numeric(area) - ALAND, name, pos = 2))
# The same overlap checking as before, but for smaller units
counties_outline <- st_union(counties)
not_border_zone <- st_buffer(counties_outline, dist = -meters)
border_zone <- st_difference(counties_outline, not_border_zone)
# compute overlaps and estimate border population
overlaps <- st_intersection(counties, border_zone) # slow - about 14s on my laptop
overlaps <- overlaps %>%
mutate(STATEFP = as.character(STATEFP), # for joining later
border_area = st_area(st_geometry(.)),
prop_overlap = as.numeric(border_area / area),
pop2016 = pop2016 * prop_overlap) # scale population down by overlap
border_population_us <- sum(overlaps$pop2016) / sum(counties$pop2016)
# aggregate county population up to state level
pops_by_state <- counties %>% # cont
group_by(state_name) %>%
summarise(population = sum(pop2016)) %>%
st_set_geometry(NULL)
# aggregate border county population up to state level
border_pops_by_state <- overlaps %>%
group_by(state_name) %>%
summarise(population = sum(pop2016)) %>%
st_set_geometry(NULL)
# adjust to add puerto rico population: 3411307 in 2016 according to the CB
pr_pop <- 3411307
border_pops_by_state <- data.frame(state_name = c(border_pops_by_state$state_name, "Puerto Rico"),
population = c(border_pops_by_state$population, pr_pop))
pops_by_state <- data.frame(state_name = c(pops_by_state$state_name, "Puerto Rico"),
population = c(pops_by_state$population, pr_pop))
border_population_us <- sum(border_pops_by_state$population) /
sum(pops_by_state$population)
# 0.6119658
border_pops_by_state <- left_join(pops_by_state, border_pops_by_state,
by = "state_name", suffix = c("", "_border")) %>%
mutate(population_border = ifelse(is.na(population_border), 0, population_border),
prop = population_border / population,
state_name = factor(state_name, levels = state_name[order(prop)])) %>%
arrange(desc(prop))
ggplot(border_pops_by_state, aes(x = state_name, y = 100 * prop)) +
geom_point() +
coord_flip() +
labs(y = "Percent",
x = "State",
title = "Estimated Percentage of Population in each State / Territory in 'Border' Zone",
subtitle = sprintf("Estimated Percentage of Total Population: %.2f",
border_population_us)) +
theme_minimal()
# ggsave("pics/border-zone-pop-proportions-by-state.png", dpi = 300, width = 7, height = 7)
# Here are the differences between the population and the state area proportions
differences <- left_join(border_areas_by_state, border_pops_by_state,
by = "state_name", suffix = c("_area", "_pop")) %>%
filter(!(prop_area %in% c(0.0, 1.0)))
ggplot(differences, aes(x = prop_area, y = prop_pop, label = state_name)) +
geom_point() +
geom_text_repel() +
geom_abline(alpha = 0.3) +
ylim(0,1) +
xlim(0,1) +
xlab("Percentage of State in 'Border' Zone") +
ylab("Estimated 'Border' Population Percentage of State") +
theme_minimal()
# ggsave("pics/border-zone-pop-area-diffs-by-state.png", dpi = 300, width = 7, height = 7)